Make plots for Deaf1b RNAseq data collected from 2dpf and 6dpf zebrafish head cuts. For 2dpf, there are three mutations (c207y, t238p, and 23bpi) and each has wt, het, and hom samples. For 6dpf, there is only the 23bpi mutation.
library("tidyverse")
library("DESeq2") #finding differentially expressed genes
library("cowplot") #arranging plots into grids
library("viridis") #viridis color schemes
library("scales") #use to get color schemes for viridis
library("ggrepel") #repels text labels on plots
library("RColorBrewer") #pick colors
library("DT") #interactive and searchable tables of our GSEA results
library("GSEABase") #functions and methods for Gene Set Enrichment Analysis
library("Biobase") #base functions for bioconductor; required by GSEABase
library("GSVA") #Gene Set Variation Analysis, a non-parametric and unsupervised method for estimating variation of gene set enrichment across samples.
library("gprofiler2") #tools for accessing the GO enrichment results using g:Profiler web resources
library("clusterProfiler") # provides a suite of tools for functional enrichment analysis
library("msigdbr") # access to msigdb collections directly within R
library("enrichplot") # great for making the standard GSEA enrichment plots
library("ontologyIndex") #for parsing obo files
library("BaseSet") #for importing gaf file
library("plotly") #make interactive plots
library("orthogene") #convert gene names between species
library("ChIPseeker") #for annotating and browsing chip-seq data
library("TxDb.Hsapiens.UCSC.hg38.knownGene", pos = .Machine$integer.max) #human gene annotation for chip-seq data
library("EnsDb.Hsapiens.v75", pos = .Machine$integer.max) #human gene annotation for chip-seq data
library("org.Dr.eg.db") #convert ensembl gene names to gene id
library("lattice") #used for making manhattan plot
library("ggpubr") #calculating correlation coefficient
library("colorspace") #colors for heatmap
library("ggraph") #library for making network graphs
library("svglite") #export svgs in loop#import sample file with columns containing sample name, file name, condition, and replicate
sampleTable <- read_csv("input/input_deaf1together.csv")
#make conditions a factor
sampleTable$condition <- factor(sampleTable$condition, levels = c("d23bpi_wt", "dc207y_wt", "dt238p_wt", "d23bpi_het", "dc207y_het", "dt238p_het", "d23bpi_hom", "dc207y_hom", "dt238p_hom"))
#create dds based on sample data
ddsHTSeq <-DESeqDataSetFromHTSeqCount(sampleTable=sampleTable, design=~condition)
dds <-DESeq(ddsHTSeq)
#prefilter (only keep genes that don't have counts of 0 in more than 3 samples)
keep <- rowSums(counts(dds) == 0) < 4
dds <- dds[keep,]
#export normalized counts for all samples
#each gene (with llgene ID, gene name, and entrez ID) and then normalized counts for each sample
countsdata <-counts(dds,normalized=TRUE)
namelist <-read.csv("input/LLgeneID_entrezID.csv")
##want to merge countsdata with namelist and gene_name
rownames(namelist) <- namelist[,1]
gene_name <-read.csv("input/llgeneid_genename.csv") ##this file has LLgeneID as column 1 and LLgeneAbbrev/gene name as column 2
rownames(gene_name) <- gene_name[,1]
genecounts<-merge(gene_name,namelist,by=0)
rownames(genecounts)<-genecounts[,1]
genecounts<-merge(genecounts,countsdata,by=0)
genecounts<-genecounts[,-(1:3)]
#export csv with normalized gene counts for each gene and each sample
write.csv(genecounts,file = "comparisons/normalized_reads_gene_list_2dpf.csv")
#plot PCA with all samples
plotPCA(rlog(dds),"condition")The 2dpf samples cluster by clutch (ie, het, hom, and wt siblings together) rather than different mutants clustering together.
This uses a loop to make planned comparisons between wt/hom, wt/het, and het/hom for all three 2 dpf mutants. It makes one csv file per comparison and one summary file with all comparisons (“deaf1DEGdata_2dpf”).
#import data frame with the comparisons that we want to make
#should have "name" of comparison, first group for comparison, and second group for comparison
#"wt" group is in the third columnm,
deaf1comparisons <- read.csv("input/deaf1_plannedcomparisons.csv")
#make an empty table to hold data
deaf1DEGdata <- setNames(data.frame(matrix(ncol = 9, nrow = 0)), c("LLgeneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj", "LLgeneAbbrev", "comparison"))
#make a loop that performs each comparison
for (x in 1:length(deaf1comparisons$comparison.name)) {
experimental <- deaf1comparisons[x,2] #name of experimental group
control <- deaf1comparisons[x,3] #name of control group
comparisonname <- deaf1comparisons[x,1] #name of comparison
res <- results(dds, contrast=c("condition",experimental,control))
res<-res[order(res$pvalue),]
rlog<-rlog(dds)
vst <-vst(dds)
dataframe_res <-as.data.frame(res)
dataframe_res$LLgeneID<-row.names(dataframe_res)
dataframe_res<-dataframe_res[c(7,1:6)]
res_gene <-inner_join(dataframe_res,gene_name,by="LLgeneID")
res_gene<-res_gene[!is.na(res_gene$padj),]
res_gene$comparison <- comparisonname #add column with comparison name
write.csv(res_gene,file=paste0("comparisons/deaf1_", comparisonname, ".csv")) #export file
deaf1DEGdata <- rbind(deaf1DEGdata, res_gene) #add comparison to table with all data
}
#export table with all comparisons
write.csv(deaf1DEGdata, "comparisons/deaf1DEGdata_2dpf.csv")
#make a copy of the 2dpf data
deaf1DEGdata_2dpf <- deaf1DEGdata#import sample file with columns containing sample name, file name, condition, and replicate
sampleTable_6dpf <- read_csv("input/input_deaf1together_6dpf.csv")
#make conditions a factor
sampleTable_6dpf$condition <- factor(sampleTable_6dpf$condition, levels = c("d23bpi_wt_6dpf", "d23bpi_het_6dpf", "d23bpi_hom_6dpf"))
#create dds based on sample data
ddsHTSeq_6dpf <-DESeqDataSetFromHTSeqCount(sampleTable=sampleTable_6dpf, design=~condition)
dds_6dpf <-DESeq(ddsHTSeq_6dpf)
#prefilter
keep <- rowSums(counts(dds_6dpf) == 0) < 4
dds_6dpf <- dds_6dpf[keep,]
#export normalized counts for all samples
#each gene (with llgene ID, gene name, and entrez ID) and then normalized counts for each sample
countsdata_6dpf <-counts(dds_6dpf,normalized=TRUE)
genecounts_6dpf<-merge(gene_name,namelist,by=0)
rownames(genecounts_6dpf)<-genecounts_6dpf[,1]
genecounts_6dpf<-merge(genecounts_6dpf,countsdata_6dpf,by=0)
genecounts_6dpf<-genecounts_6dpf[,-(1:3)]
#export csv with normalized gene counts for each gene and each sample
write.csv(genecounts_6dpf,file = "comparisons/normalized_reads_gene_list_6dpf.csv")
#plot PCA with all samples
plotPCA(rlog(dds_6dpf),"condition")
#add rownames to genecounts
rownames(genecounts) <- genecounts$LLgeneID.y
#add rownames to genecounts_6dpf
rownames(genecounts_6dpf) <- genecounts_6dpf$LLgeneID.y
#merge 2dpf and 6dpf counts
genecounts <- merge(genecounts,genecounts_6dpf, by="row.names", all=TRUE)
#if NA in LLgeneAbbrev.x, get value from LLgeneAbbrev.y
genecounts <- genecounts %>% mutate(LLgeneAbbrev.x = ifelse(is.na(LLgeneAbbrev.x), LLgeneAbbrev.y, LLgeneAbbrev.x))
genecounts <- genecounts %>% mutate(LLgeneID.y.x = ifelse(is.na(LLgeneID.y.x), LLgeneID.y.y, LLgeneID.y.x))
genecounts <- genecounts %>% mutate(EntrezGeneID.x = ifelse(is.na(EntrezGeneID.x), EntrezGeneID.y, EntrezGeneID.x))
#get rid of extra columns
genecounts <- genecounts %>% dplyr::select(-c(Row.names, LLgeneAbbrev.y, LLgeneID.y.y, EntrezGeneID.y))
#rename columns
colnames(genecounts)[1:3] <- c("LLgeneAbbrev", "LLgeneID.y", "EntrezGeneID")
#export csv with normalized gene counts for both 2dpf and 6dpf
write.csv(genecounts,file = "comparisons/normalized_reads_gene_list_2dpf-6dpf.csv")This uses a loop to make planned comparisons between wt/hom, wt/het, and het/hom for 23bp insertion 6dpf mutant. It makes one csv file per comparison and one summary file with all comparisons (“deaf1DEGdata”).
#import data frame with the comparisons that we want to make
#should have "name" of comparison, first group for comparison, and second group for comparison
#"wt" group is in the third column
deaf1comparisons_6dpf <- read.csv("input/deaf1_plannedcomparisons_6dpf.csv")
#make an empty table to hold data
deaf1DEGdata_6dpf <- setNames(data.frame(matrix(ncol = 9, nrow = 0)), c("LLgeneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj", "LLgeneAbbrev", "comparison"))
#make a loop that performs each comparison
for (x in 1:length(deaf1comparisons_6dpf$comparison.name)) {
experimental <- deaf1comparisons_6dpf[x,2] #name of experimental group
control <- deaf1comparisons_6dpf[x,3] #name of control group
comparisonname <- deaf1comparisons_6dpf[x,1] #name of comparison
res <- results(dds_6dpf, contrast=c("condition",experimental,control))
res<-res[order(res$pvalue),]
rlog<-rlog(dds_6dpf)
vst <-vst(dds_6dpf)
dataframe_res <-as.data.frame(res)
dataframe_res$LLgeneID<-row.names(dataframe_res)
dataframe_res<-dataframe_res[c(7,1:6)]
res_gene <-inner_join(dataframe_res,gene_name,by="LLgeneID")
res_gene<-res_gene[!is.na(res_gene$padj),]
res_gene$comparison <- comparisonname #add column with comparison name
write.csv(res_gene,file=paste0("comparisons/deaf1_", comparisonname, ".csv")) #export file
deaf1DEGdata_6dpf <- rbind(deaf1DEGdata_6dpf, res_gene) #add comparison to table with all data
}
#export table with all comparisons
write.csv(deaf1DEGdata_6dpf, "comparisons/deaf1DEGdata_6dpf.csv")
#combine 2dpf and 6dpf comparisons
deaf1DEGdata <- rbind(deaf1DEGdata_2dpf, deaf1DEGdata_6dpf)
#add some information to deaf1DEGdata
#limit padj to 1e-10
deaf1DEGdata <- deaf1DEGdata %>% mutate(padjbound = ifelse(deaf1DEGdata$padj < 1e-10, 1e-10, deaf1DEGdata$padj))
#limit pvalue to 1e-10
deaf1DEGdata <- deaf1DEGdata %>% mutate(pvaluebound = ifelse(deaf1DEGdata$pvalue < 1e-10, 1e-10, deaf1DEGdata$pvalue))
#import annotations for chromosome and TSS
chromosomeinfo <- readr::read_tsv(file="ncbi_refseqgenes")
#keep chromosome and gene name
chromosomeinfo <- chromosomeinfo %>% dplyr::select(chrom, name2, txStart)
deaf1DEGdata$chrom <- NA
deaf1DEGdata$txStart <- NA
#add chromosome and txStart information to deaf1DEGdata
for (x in 1:length(deaf1DEGdata$LLgeneAbbrev)) {
gene <- deaf1DEGdata$LLgeneAbbrev[x]
chromosomeinfosub <- chromosomeinfo %>% dplyr::filter(name2 == gene)
chr <- chromosomeinfosub$chrom[1]
txStart <- chromosomeinfosub$txStart[1]
deaf1DEGdata$chrom[x] <- chr
deaf1DEGdata$txStart[x] <- txStart
}
#add a column with chromosome number
deaf1DEGdata <- deaf1DEGdata %>% mutate(chromosomenumber = str_replace_all(chrom, c("chr1"="1", "chr2"="2", "chr3"="3", "chr4"="4", "chr5"="5", "chr6"="6", "chr7"="7", "chr8"="8", "chr9"="9", "chr10"="10", "chr11"="11", "chr12"="12", "chr13"="13", "chr14"="14", "chr15"="15", "chr16"="16", "chr17"="17", "chr18"="18", "chr19"="19", "chr20"="20", "chr21"="21", "chr22"="22", "chr23"="23", "chr24"="24", "chr25"="25")))
deaf1DEGdata <- distinct(deaf1DEGdata)
#export csv with DEG for both 2dpf and 6dpf
write.csv(deaf1DEGdata,file = "comparisons/Deaf1_DEG_2dpf-6dpf.csv")Are differentially expressed genes located at a particular place in the genome?
This function is from: https://genome.sph.umich.edu/wiki/Code_Sample:_Generating_Manhattan_Plots_in_R
#import comparisons (to skip running previous steps)
deaf1DEGdata <- read.csv("comparisons/Deaf1_DEG_2dpf-6dpf.csv")[,2:15]
#import normalized counts (to skip running previous steps)
genecounts <- read.csv("comparisons/normalized_reads_gene_list_2dpf-6dpf.csv")[,2:57]
#function for making manhattan plots
manhattan.plot<-function(chr, pos, pvalue,
sig.level=NA, annotate=NULL, ann.default=list(),
should.thin=T, thin.pos.places=2, thin.logp.places=2,
xlab="Chromosome", ylab=expression(-log[10](p-value)),
col=c("gray","darkgray"), panel.extra=NULL, pch=20, cex=0.8,...) {
if (length(chr)==0) stop("chromosome vector is empty")
if (length(pos)==0) stop("position vector is empty")
if (length(pvalue)==0) stop("pvalue vector is empty")
#make sure we have an ordered factor
if(!is.ordered(chr)) {
chr <- ordered(chr)
} else {
chr <- chr[,drop=T]
}
#make sure positions are in kbp
if (any(pos>1e6)) pos<-pos/1e6;
#calculate absolute genomic position
#from relative chromosomal positions
posmin <- tapply(pos,chr, min);
posmax <- tapply(pos,chr, max);
posshift <- head(c(0,cumsum(posmax)),-1);
names(posshift) <- levels(chr)
genpos <- pos + posshift[chr];
getGenPos<-function(cchr, cpos) {
p<-posshift[as.character(cchr)]+cpos
return(p)
}
#parse annotations
grp <- NULL
ann.settings <- list()
label.default<-list(x="peak",y="peak",adj=NULL, pos=3, offset=0.5,
col=NULL, fontface=NULL, fontsize=NULL, show=F)
parse.label<-function(rawval, groupname) {
r<-list(text=groupname)
if(is.logical(rawval)) {
if(!rawval) {r$show <- F}
} else if (is.character(rawval) || is.expression(rawval)) {
if(nchar(rawval)>=1) {
r$text <- rawval
}
} else if (is.list(rawval)) {
r <- modifyList(r, rawval)
}
return(r)
}
if(!is.null(annotate)) {
if (is.list(annotate)) {
grp <- annotate[[1]]
} else {
grp <- annotate
}
if (!is.factor(grp)) {
grp <- factor(grp)
}
} else {
grp <- factor(rep(1, times=length(pvalue)))
}
ann.settings<-vector("list", length(levels(grp)))
ann.settings[[1]]<-list(pch=pch, col=col, cex=cex, fill=col, label=label.default)
if (length(ann.settings)>1) {
lcols<-trellis.par.get("superpose.symbol")$col
lfills<-trellis.par.get("superpose.symbol")$fill
for(i in 2:length(levels(grp))) {
ann.settings[[i]]<-list(pch=pch,
col=lcols[(i-2) %% length(lcols) +1 ],
fill=lfills[(i-2) %% length(lfills) +1 ],
cex=cex, label=label.default);
ann.settings[[i]]$label$show <- T
}
names(ann.settings)<-levels(grp)
}
for(i in 1:length(ann.settings)) {
if (i>1) {ann.settings[[i]] <- modifyList(ann.settings[[i]], ann.default)}
ann.settings[[i]]$label <- modifyList(ann.settings[[i]]$label,
parse.label(ann.settings[[i]]$label, levels(grp)[i]))
}
if(is.list(annotate) && length(annotate)>1) {
user.cols <- 2:length(annotate)
ann.cols <- c()
if(!is.null(names(annotate[-1])) && all(names(annotate[-1])!="")) {
ann.cols<-match(names(annotate)[-1], names(ann.settings))
} else {
ann.cols<-user.cols-1
}
for(i in seq_along(user.cols)) {
if(!is.null(annotate[[user.cols[i]]]$label)) {
annotate[[user.cols[i]]]$label<-parse.label(annotate[[user.cols[i]]]$label,
levels(grp)[ann.cols[i]])
}
ann.settings[[ann.cols[i]]]<-modifyList(ann.settings[[ann.cols[i]]],
annotate[[user.cols[i]]])
}
}
rm(annotate)
#reduce number of points plotted
if(should.thin) {
thinned <- unique(data.frame(
logp=round(-log10(pvalue),thin.logp.places),
pos=round(genpos,thin.pos.places),
chr=chr,
grp=grp)
)
logp <- thinned$logp
genpos <- thinned$pos
chr <- thinned$chr
grp <- thinned$grp
rm(thinned)
} else {
logp <- -log10(pvalue)
}
rm(pos, pvalue)
gc()
#custom axis to print chromosome names
axis.chr <- function(side,...) {
if(side=="bottom") {
panel.axis(side=side, outside=T,
at=((posmax+posmin)/2+posshift),
labels=levels(chr),
ticks=F, rot=0,
check.overlap=F
)
} else if (side=="top" || side=="right") {
panel.axis(side=side, draw.labels=F, ticks=F);
}
else {
axis.default(side=side,...);
}
}
#make sure the y-lim covers the range (plus a bit more to look nice)
prepanel.chr<-function(x,y,...) {
A<-list();
maxy<-ceiling(max(y, ifelse(!is.na(sig.level), -log10(sig.level), 0)))+.5;
A$ylim=c(0,maxy);
A;
}
xyplot(logp~genpos, chr=chr, groups=grp,
axis=axis.chr, ann.settings=ann.settings,
prepanel=prepanel.chr, scales=list(axs="i"),
panel=function(x, y, ..., getgenpos) {
if(!is.na(sig.level)) {
#add significance line (if requested)
panel.abline(h=-log10(sig.level), lty=2);
}
panel.superpose(x, y, ..., getgenpos=getgenpos);
if(!is.null(panel.extra)) {
panel.extra(x,y, getgenpos, ...)
}
},
panel.groups = function(x,y,..., subscripts, group.number) {
A<-list(...)
#allow for different annotation settings
gs <- ann.settings[[group.number]]
A$col.symbol <- gs$col[(as.numeric(chr[subscripts])-1) %% length(gs$col) + 1]
A$cex <- gs$cex[(as.numeric(chr[subscripts])-1) %% length(gs$cex) + 1]
A$pch <- gs$pch[(as.numeric(chr[subscripts])-1) %% length(gs$pch) + 1]
A$fill <- gs$fill[(as.numeric(chr[subscripts])-1) %% length(gs$fill) + 1]
A$x <- x
A$y <- y
do.call("panel.xyplot", A)
#draw labels (if requested)
if(gs$label$show) {
gt<-gs$label
names(gt)[which(names(gt)=="text")]<-"labels"
gt$show<-NULL
if(is.character(gt$x) | is.character(gt$y)) {
peak = which.max(y)
center = mean(range(x))
if (is.character(gt$x)) {
if(gt$x=="peak") {gt$x<-x[peak]}
if(gt$x=="center") {gt$x<-center}
}
if (is.character(gt$y)) {
if(gt$y=="peak") {gt$y<-y[peak]}
}
}
if(is.list(gt$x)) {
gt$x<-A$getgenpos(gt$x[[1]],gt$x[[2]])
}
do.call("panel.text", gt)
}
},
xlab=xlab, ylab=ylab,
panel.extra=panel.extra, getgenpos=getGenPos, ...
);
}
#select columns to include in manhattan plot
myTopHits.df <- deaf1DEGdata %>% dplyr::select(chromosomenumber, txStart, pvaluebound, comparison)
#filter to only include chromosomes 1-25
myTopHits.df <- myTopHits.df %>% dplyr::filter(chromosomenumber %in% 1:25)
#make chromosomes plotted in order
myTopHits.df$chromosomenumber <- factor(myTopHits.df$chromosomenumber, levels = 1:25)
#omit NA values
myTopHits.df <- na.omit(myTopHits.df)
#make colors
manhattancol <- c(replicate(12, c("lightgrey", "darkgrey")), "#21908CFF")
#filter comparisons to only include 23bp hom v wt comparisons
manhattan <- myTopHits.df %>% dplyr::filter(comparison == "d23bp_homvwt")
#make manhattan plot
manhattan.plot(manhattan$chromosomenumber, manhattan$txStart, manhattan$pvaluebound, should.thin=F, col=manhattancol)#filter comparisons to only include one comparison
manhattan <- myTopHits.df %>% dplyr::filter(comparison == "dc207y_homvwt")
#make manhattan plot
manhattan.plot(manhattan$chromosomenumber, manhattan$txStart, manhattan$pvaluebound, should.thin=F, col=manhattancol)#filter comparisons to only include one comparison
manhattan <- myTopHits.df %>% dplyr::filter(comparison == "dt238p_homvwt")
#make manhattan plot
manhattan.plot(manhattan$chromosomenumber, manhattan$txStart, manhattan$pvaluebound, should.thin=F, col=manhattancol)#filter comparisons to only include one comparison
manhattan <- myTopHits.df %>% dplyr::filter(comparison == "d23bp_homvwt_6dpf")
#make manhattan plot
manhattan.plot(manhattan$chromosomenumber, manhattan$txStart, manhattan$pvaluebound, should.thin=F, col=manhattancol)#import DEAF1 mouse hippocampus
DEAF1mousedata <- read.csv("comparisonrnaseq/deaf1mousehippocampus.csv")
#keep only columns for manhattan
DEAF1mousedata <- DEAF1mousedata %>% dplyr::select(chromosome_name, start_position, PValue)
#make chromosomes plotted in order
DEAF1mousedata$chromosome_name <- factor(DEAF1mousedata$chromosome_name, levels = c(1:19, "X", "Y", "MT"))
#omit NA values
DEAF1mousedata <- na.omit(DEAF1mousedata)
#make colors for mouse manhattan
manhattancol_mouse <- c(replicate(3, c("lightgrey", "darkgrey")), "#21908CFF", replicate(8, c("lightgrey", "darkgrey")))
#make manhattan plot
manhattan.plot(DEAF1mousedata$chromosome_name, DEAF1mousedata$start_position, DEAF1mousedata$PValue, should.thin=F, col=manhattancol_mouse)Genes on chromosome 25 look overrepresented compared to genes on other chromosomes. Next will test whether these genes are formally overrepresented using GSEA.
#import annotations for chromosome and gene name
chromosomeinfo <- readr::read_tsv(file="ncbi_refseqgenes")
#keep chromosome and gene name
chromosomeinfo <- chromosomeinfo %>% dplyr::select(chrom, name2)
#change column names
colnames(chromosomeinfo) <- c("chromosome", "genename")
#select only chromosomes 1-25
chromosomeinfo <- chromosomeinfo %>% dplyr::filter(chromosome %in% paste("chr", 1:25, sep=""))
#keep only distinct rows
chromosomeinfo <- distinct(chromosomeinfo)# Perform GSEA using clusterProfiler
#filter to only one comparison
GSEAgenes <- deaf1DEGdata %>% dplyr::filter(comparison == "d23bp_homvwt")
#keep only the columns we need for GSEA
mydata.df.sub <- dplyr::select(GSEAgenes, LLgeneAbbrev, log2FoldChange)
#sort by abs(foldchange)
mydata.df.sub$log2FoldChange <- abs(mydata.df.sub$log2FoldChange)
# construct a named vector
mydata.gsea <- mydata.df.sub$log2FoldChange
names(mydata.gsea) <- as.character(mydata.df.sub$LLgeneAbbrev)
mydata.gsea <- sort(mydata.gsea, decreasing = TRUE)
# run GSEA using the 'GSEA' function from clusterProfiler
myGSEA.res <- GSEA(mydata.gsea, TERM2GENE=chromosomeinfo, verbose=FALSE, scoreType="pos", maxGSSize=2000)
myGSEA.df <- as_tibble(myGSEA.res@result)
# view results as an interactive table
datatable(myGSEA.df,
extensions = c('KeyTable', "FixedHeader"),
options = list(keys = TRUE, searchHighlight = TRUE, pageLength = 10, lengthMenu = c("10", "25", "50", "100"))) %>%
formatRound(columns=c(2:10), digits=2)# create enrichment plots using the enrichplot package
gseaplot2(myGSEA.res,
geneSetID = c(1), #can choose multiple signatures to overlay in this plot
pvalue_table = FALSE, #can set this to FALSE for a cleaner plot
title = myGSEA.res$Description[1]) #can also turn off this title# Perform GSEA using clusterProfiler
#filter to only one comparison
GSEAgenes <- deaf1DEGdata %>% dplyr::filter(comparison == "dc207y_homvwt")
#keep only the columns we need for GSEA
mydata.df.sub <- dplyr::select(GSEAgenes, LLgeneAbbrev, log2FoldChange)
#sort by abs(foldchange)
mydata.df.sub$log2FoldChange <- abs(mydata.df.sub$log2FoldChange)
# construct a named vector
mydata.gsea <- mydata.df.sub$log2FoldChange
names(mydata.gsea) <- as.character(mydata.df.sub$LLgeneAbbrev)
mydata.gsea <- sort(mydata.gsea, decreasing = TRUE)
# run GSEA using the 'GSEA' function from clusterProfiler
myGSEA.res <- GSEA(mydata.gsea, TERM2GENE=chromosomeinfo, verbose=FALSE, scoreType="pos", maxGSSize=2000)
myGSEA.df <- as_tibble(myGSEA.res@result)
# view results as an interactive table
datatable(myGSEA.df,
extensions = c('KeyTable', "FixedHeader"),
options = list(keys = TRUE, searchHighlight = TRUE, pageLength = 10, lengthMenu = c("10", "25", "50", "100"))) %>%
formatRound(columns=c(2:10), digits=2)# create enrichment plots using the enrichplot package
gseaplot2(myGSEA.res,
geneSetID = c(1), #can choose multiple signatures to overlay in this plot
pvalue_table = FALSE, #can set this to FALSE for a cleaner plot
title = myGSEA.res$Description[1]) #can also turn off this title# Perform GSEA using clusterProfiler
#filter to only one comparison
GSEAgenes <- deaf1DEGdata %>% dplyr::filter(comparison == "dt238p_homvwt")
#keep only the columns we need for GSEA
mydata.df.sub <- dplyr::select(GSEAgenes, LLgeneAbbrev, log2FoldChange)
#sort by abs(foldchange)
mydata.df.sub$log2FoldChange <- abs(mydata.df.sub$log2FoldChange)
# construct a named vector
mydata.gsea <- mydata.df.sub$log2FoldChange
names(mydata.gsea) <- as.character(mydata.df.sub$LLgeneAbbrev)
mydata.gsea <- sort(mydata.gsea, decreasing = TRUE)
# run GSEA using the 'GSEA' function from clusterProfiler
myGSEA.res <- GSEA(mydata.gsea, TERM2GENE=chromosomeinfo, verbose=FALSE, scoreType="pos", maxGSSize=2000)
myGSEA.df <- as_tibble(myGSEA.res@result)
# view results as an interactive table
datatable(myGSEA.df,
extensions = c('KeyTable', "FixedHeader"),
options = list(keys = TRUE, searchHighlight = TRUE, pageLength = 10, lengthMenu = c("10", "25", "50", "100"))) %>%
formatRound(columns=c(2:10), digits=2)# create enrichment plots using the enrichplot package
gseaplot2(myGSEA.res,
geneSetID = c(1), #can choose multiple signatures to overlay in this plot
pvalue_table = FALSE, #can set this to FALSE for a cleaner plot
title = myGSEA.res$Description[1]) #can also turn off this title# Perform GSEA using clusterProfiler
#filter to only one comparison
GSEAgenes <- deaf1DEGdata %>% dplyr::filter(comparison == "d23bp_homvwt_6dpf")
#keep only the columns we need for GSEA
mydata.df.sub <- dplyr::select(GSEAgenes, LLgeneAbbrev, log2FoldChange)
#sort by abs(foldchange)
mydata.df.sub$log2FoldChange <- abs(mydata.df.sub$log2FoldChange)
# construct a named vector
mydata.gsea <- mydata.df.sub$log2FoldChange
names(mydata.gsea) <- as.character(mydata.df.sub$LLgeneAbbrev)
mydata.gsea <- sort(mydata.gsea, decreasing = TRUE)
# run GSEA using the 'GSEA' function from clusterProfiler
myGSEA.res <- GSEA(mydata.gsea, TERM2GENE=chromosomeinfo, verbose=FALSE, scoreType="pos", maxGSSize=2000)
myGSEA.df <- as_tibble(myGSEA.res@result)
# view results as an interactive table
datatable(myGSEA.df,
extensions = c('KeyTable', "FixedHeader"),
options = list(keys = TRUE, searchHighlight = TRUE, pageLength = 10, lengthMenu = c("10", "25", "50", "100"))) %>%
formatRound(columns=c(2:10), digits=2)# create enrichment plots using the enrichplot package
gseaplot2(myGSEA.res,
geneSetID = c(1), #can choose multiple signatures to overlay in this plot
pvalue_table = FALSE, #can set this to FALSE for a cleaner plot
title = myGSEA.res$Description[1]) #can also turn off this title#import DEAF1 mouse hippocampus
GSEAgenes <- read.csv("comparisonrnaseq/deaf1mousehippocampus.csv")
#keep only the columns we need for GSEA
mydata.df.sub <- dplyr::select(GSEAgenes, Symbol, FC_NKOvsControl)
#abs(foldchange)
mydata.df.sub$FC_NKOvsControl <- abs(mydata.df.sub$FC_NKOvsControl)
# construct a named vector
mydata.gsea <- mydata.df.sub$FC_NKOvsControl
names(mydata.gsea) <- as.character(mydata.df.sub$Symbol)
mydata.gsea <- sort(mydata.gsea, decreasing = TRUE)
#import annotations for mouse chromosome and gene name
chromosomeinfo <- readr::read_tsv(file="ncbirefseqgenes_mouse")
#keep chromosome and gene name
chromosomeinfo <- chromosomeinfo %>% dplyr::select(chrom, name2)
#change column names
colnames(chromosomeinfo) <- c("chromosome", "genename")
#select only chromosomes 1-19 + XYM
chromosomeinfo <- chromosomeinfo %>% dplyr::filter(chromosome %in% c(paste("chr", 1:19, sep=""), "chrM", "chrX", "chrY"))
#keep only distinct rows
chromosomeinfo <- distinct(chromosomeinfo)
# run GSEA using the 'GSEA' function from clusterProfiler
myGSEA.res <- GSEA(mydata.gsea, TERM2GENE=chromosomeinfo, verbose=FALSE, scoreType="pos", maxGSSize=20000, pvalueCutoff = 1, pAdjustMethod="none")
myGSEA.df <- as_tibble(myGSEA.res@result)
# view results as an interactive table
datatable(myGSEA.df,
extensions = c('KeyTable', "FixedHeader"),
options = list(keys = TRUE, searchHighlight = TRUE, pageLength = 10, lengthMenu = c("10", "25", "50", "100"))) %>%
formatRound(columns=c(2:10), digits=2)Genes on chromosome 7 are not statistically significantly enriched in the DEAF1 RNAseq data.
#keep only 2dpf comparisons
myTopHits.df <- deaf1DEGdata %>% dplyr::filter(comparison %in% c("d23bp_homvwt", "dc207y_homvwt", "dt238p_homvwt"))
#filter to only include those with logfc > 1
myTopHits.df <- myTopHits.df %>% dplyr::filter(abs(log2FoldChange) > 1)
#get names of genes with logfc > 1
genestoplot <- unique(myTopHits.df$LLgeneAbbrev)
#filter to only include 2dpf homvwt comparisons
myTopHits.df <- deaf1DEGdata %>% dplyr::filter(comparison %in% c("d23bp_homvwt", "dc207y_homvwt", "dt238p_homvwt"))
#filter to only include genes on chr 25
myTopHits.df <- myTopHits.df %>% dplyr::filter(chrom == "chr25")
#change back to fold change
myTopHits.df <- myTopHits.df %>% mutate(foldchange = 2^log2FoldChange)
#change the order of the genes
myTopHits.df <- myTopHits.df %>% arrange(txStart)
#filter to include only those with log2fc >1
myTopHits.df <- myTopHits.df %>% dplyr::filter(LLgeneAbbrev %in% genestoplot)
#order comparisons
myTopHits.df$comparison <- factor(myTopHits.df$comparison, levels = c("d23bp_homvwt", "dc207y_homvwt", "dt238p_homvwt"))
#rename comparisons
levels(myTopHits.df$comparison) <- c("23d46i 2dpf", "c207y 2dpf", "t238p 2dpf")
#make genes into a factor
myTopHits.df$LLgeneAbbrev <- factor(myTopHits.df$LLgeneAbbrev, levels=unique(myTopHits.df$LLgeneAbbrev))
plot <- ggplot(myTopHits.df, aes(y=log2FoldChange, x=LLgeneAbbrev, fill=factor(ifelse(LLgeneAbbrev == "deaf1", "Highlighted", "Normal")))) +
geom_bar(position="dodge", stat="summary", ) +
ylab("log2 fold change") +
theme_bw() +
facet_wrap(~comparison, ncol=1) +
theme(axis.text.x = element_text(angle = 45, hjust=1), legend.position = "none", axis.title.x=element_blank()) +
scale_fill_manual(values = c("#21908CFF", "gray35"))
plot#filter to only include 2dpf homvwt comparisons
myTopHits.df <- deaf1DEGdata %>% dplyr::filter(comparison %in% c("d23bp_homvwt", "dc207y_homvwt", "dt238p_homvwt"))
#filter to only include genes on chr 25
myTopHits.df <- myTopHits.df %>% dplyr::filter(chrom == "chr25")
#change the order of the genes
myTopHits.df <- myTopHits.df %>% arrange(txStart)
#order comparisons
myTopHits.df$comparison <- factor(myTopHits.df$comparison, levels = c("d23bp_homvwt", "dc207y_homvwt", "dt238p_homvwt"))
#rename comparisons
levels(myTopHits.df$comparison) <- c("23d46i 2dpf", "c207y 2dpf", "t238p 2dpf")
chr25plot <- ggplot(myTopHits.df, aes(y=log2FoldChange, x=txStart, color=factor(ifelse(LLgeneAbbrev == "deaf1", "Highlighted", "Normal")))) +
geom_bar(stat="summary" ) +
ylab("log2 fold change") +
xlab("TSS base position") +
theme_bw() +
facet_wrap(~comparison, ncol=1) +
theme(axis.text.x = element_text(angle = 45, hjust=1), legend.position = "none") +
scale_color_manual(values = c("#21908CFF", "gray35")) +
scale_x_continuous(n.breaks = 20, labels = scales::comma, limits = c(0, 38000000), expand = c(0, 0))
chr25plotIt looks like there’s a cluster of histone genes on the distal end of chromosome 25 that is upregulated in 23bp and downregulated in t238p.
#filter to only include homvwt comparisons
myTopHits.df <- deaf1DEGdata %>% dplyr::filter(comparison %in% c("d23bp_homvwt", "dc207y_homvwt", "dt238p_homvwt"))
#filter to only include genes on chr 25
myTopHits.df <- myTopHits.df %>% dplyr::filter(chrom == "chr25")
#filter to only include those with pval <0.05
myTopHits.df <- myTopHits.df %>% dplyr::filter(pvalue < 0.05)
#get list of genes
genesofinterest <- unique(myTopHits.df$LLgeneAbbrev)
#get data for list of genes
myTopHits.df <- deaf1DEGdata %>% dplyr::filter(LLgeneAbbrev %in% genesofinterest)
#filter to only include homvwt comparisons
myTopHits.df <- myTopHits.df %>% dplyr::filter(comparison %in% c("d23bp_homvwt", "dc207y_homvwt", "dt238p_homvwt"))
#keep only the columns needed to make a heat map
myTopHits.df <- myTopHits.df %>% dplyr::select(LLgeneAbbrev, comparison, log2FoldChange)
#pivot wider
myTopHits.df <- myTopHits.df %>% pivot_wider(names_from=comparison, values_from=log2FoldChange, id_cols = LLgeneAbbrev, values_fn = ~ mean(.x, na.rm = TRUE))
#filter to keep those all + and >0.3 in two conditions
keepup <- rowSums(myTopHits.df[,2:4] > 0) > 2
keepup <- myTopHits.df[keepup,]
keepupfold <- rowSums(keepup[,2:4] > 0.3) > 1
keepup <- keepup[keepupfold,]
keepup <- na.omit(keepup)
#filter to keep those all - and <-0.3 in two conditions
keepdown <- rowSums(myTopHits.df[,2:4] < 0) > 2
keepdown <- myTopHits.df[keepdown,]
keepdownfold <- rowSums(keepdown[,2:4] < -0.3) > 1
keepdown <- keepdown[keepdownfold,]
keepdown <- na.omit(keepdown)
#make list of genes to include
chr25genestokeep <- c(keepdown$LLgeneAbbrev, keepup$LLgeneAbbrev)
#filter to only include homvwt comparisons
myTopHits.df <- deaf1DEGdata %>% dplyr::filter(comparison %in% c("d23bp_homvwt", "dc207y_homvwt", "dt238p_homvwt"))
#filter to only include genes on chr 25
myTopHits.df <- myTopHits.df %>% dplyr::filter(chrom == "chr25")
#make list of all chr25 genes
chr25genes <- unique(myTopHits.df$LLgeneAbbrev)
#get rid of tes (had NA value)
chr25genestokeep <- setdiff(chr25genestokeep, "tes")
#chr25 genes to get rid of
chr25genes <- setdiff(chr25genes, chr25genestokeep)
chr25genestokeep## [1] "ptpn9a" "muc5.1" "saal1" "hdac10" "zgc:153293"
## [6] "morc2" "ppib" "fam185a" "tnnt2d" "cdkn1bb"
## [11] "zgc:193812" "crabp1a" "lrrc29" "pamr1" "pdia3"
## [16] "magi2b" "abtb2b" "zgc:154077" "ccdc113" "cpne8"
## [21] "muc5.2" "LOC110438378" "snupn" "cav1" "btbd10a"
## [26] "parvaa" "galk2" "dkk3b" "dusp6" "vac14"
## [31] "tmem17" "kitlga" "tbc1d2b" "bbs4" "lrrk2"
## [36] "e2f4" "tnni2a.2" "mob2a" "deaf1"
#comparisons to include in plot
comparisons <- c("dt238p_homvwt", "dc207y_homvwt", "d23bp_homvwt_6dpf", "d23bp_homvwt")
#filter to only include genes of interest
bubbleDEG <- deaf1DEGdata %>% dplyr::filter(LLgeneAbbrev %in% chr25genestokeep & comparison %in% comparisons)
#find middle of foldchange (to center color scale)
limit <- max(abs(bubbleDEG$log2FoldChange)) * c(-1, 1)
#get color
myheatcolors3 <- brewer.pal(name="RdBu", n=11)
#set order of genes along x axis
geneorder <- c(rev(sort(keepup$LLgeneAbbrev)), rev(sort(keepdown$LLgeneAbbrev)))
#order genes
bubbleDEG$LLgeneAbbrev <- factor(bubbleDEG$LLgeneAbbrev, levels = geneorder)
#order comparisons
bubbleDEG$comparison <- factor(bubbleDEG$comparison, levels = rev(c("d23bp_homvwt_6dpf", "d23bp_homvwt", "dt238p_homvwt", "dc207y_homvwt")))
#rename comparisons
levels(bubbleDEG$comparison) <- rev(c("23d46i 6dpf", "23d46i 2dpf", "t238p 2dpf", "c207y 2dpf"))
#rename key legends
colnames(bubbleDEG) <- c("LLgeneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padjn", "LLgeneAbbrev", "comparison", "padj")
# create 'bubble plot' to summarize y signatures across x phenotypes
bubbleplot_chr25 <- ggplot(bubbleDEG, aes(x=LLgeneAbbrev, y=comparison)) +
geom_point(aes(size=-log10(padj), color = log2FoldChange)) +
scale_color_gradientn(colors = myheatcolors3, limit=limit) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
axis.title.x = element_blank(),
axis.title.y = element_blank())+
annotate("rect", xmin = 0, xmax = 20.5, ymin = 0.25, ymax = 0.5,
alpha=1, fill="#21908CFF") +
annotate("rect", xmin = 20.5, xmax = 40, ymin = 0.25, ymax = 0.5,
alpha=1, fill="#472D7BFF")
bubbleplot_chr25 + ggplot2::coord_flip()#add a column that is 1 if pvalue is <0.05 and +FC, -1 if padj is <0.05 and -FC
deaf1DEGdata2 <- deaf1DEGdata %>% mutate(Venn = ifelse(pvalue < 0.01 & log2FoldChange > 0, 1, ifelse(pvalue < 0.01 & log2FoldChange < 0, 1, 0)))
#filter to only include homvwt comparisons
deaf1DEGdata2 <- deaf1DEGdata2 %>% dplyr::filter(comparison %in% c("d23bp_homvwt", "dc207y_homvwt", "dt238p_homvwt", "d23bp_homvwt_6dpf"))
#make a new table with only gene name, comparison name, and Venn column
venntable <- deaf1DEGdata2 %>% dplyr::select(LLgeneAbbrev, comparison, Venn)
#find duplicates in venntable
duplicategenes <- dplyr::filter(venntable %>%
distinct() %>%
group_by(LLgeneAbbrev, comparison) %>%
dplyr::count(), n !=1)$LLgeneAbbrev
#remove duplicate rows from venntable
venntable <- venntable %>% dplyr::filter(!LLgeneAbbrev %in% duplicategenes)
#use pivot wider to make untidy table
venntable <- as.matrix(venntable %>% distinct() %>%
pivot_wider(names_from = comparison, values_from = Venn, values_fill=0) %>%
column_to_rownames('LLgeneAbbrev'))
#convert NULL to 0
venntable[venntable=='NULL'] <- 0
#find sum of values of rows
rowsum <- rowSums(venntable)
venntable <- cbind(venntable, rowsum)
#convert to dataframe
venntable <- as.data.frame(venntable)
#sort by those with highest sum of absolute values of rows
venntable <- venntable %>% arrange(desc(abs(rowsum)))
#filter by those with ==1 sig gene
venntable <- venntable %>% dplyr::filter(rowsum == 1)
#filter by those where c207 is 1
venntable <- venntable %>% dplyr::filter(dc207y_homvwt == 1)
#filter by those not on chr25
venntable_nochr25 <- setdiff(rownames(venntable), chr25genes)
#get all data for list of genes
c207list <- deaf1DEGdata %>% dplyr::filter(comparison == "dc207y_homvwt" & LLgeneAbbrev %in% venntable_nochr25)
#sort by p value
c207list <- c207list %>% arrange(pvalue)
c207list$LLgeneAbbrev## [1] "e2f4" "shisal1b" "rcvrnb"
## [4] "hps4" "prox1b" "abtb2b"
## [7] "zgc:153219" "LOC110440168" "brpf1"
## [10] "dop1a" "znf1089" "urp2"
## [13] "acmsd" "col28a1a" "rhcga"
## [16] "vip" "fah" "s1pr3a"
## [19] "ttll6" "kcnv2b" "phkg2"
## [22] "si:ch211-219a15.3" "hhat" "CABZ01114105.1"
## [25] "XLOC_003689" "kmt5ab" "slco1e1"
## [28] "cldn20" "creb1a" "wnt11f2"
## [31] "XLOC_028644" "FQ311928.1" "armc4"
## [34] "cacna2d3" "adcy7" "CU467646.6"
## [37] "si:dkey-93h22.8" "efna5a" "inpp5d"
## [40] "zgc:103564" "fbxl3b" "spsb4b"
## [43] "taco1" "fam20cb" "LOC101883486"
## [46] "fam219ab" "XLOC_039632" "crygm2e"
## [49] "zswim6"
#comparisons to include in plot
comparisons <- c("dt238p_homvwt", "dc207y_homvwt", "d23bp_homvwt_6dpf", "d23bp_homvwt")
c207specificgenes <- c(sort(c("rcvrnb", "prox1b", "brpf1", "dop1a", "efna5a", "fbxl3b", "spsb4b")), sort(c("hps4", "zgc:153219", "LOC110440168","znf1089", "urp2", "vip", "fah", "s1pr3a", "ttll6", "slco1e1", "armc4", "cacna2d3")))
#filter to only include genes of interest
bubbleDEG <- deaf1DEGdata %>% dplyr::filter(LLgeneAbbrev %in% c207specificgenes & comparison %in% comparisons)
#find middle of foldchange (to center color scale)
limit <- max(abs(bubbleDEG$log2FoldChange)) * c(-1, 1)
#order genes
bubbleDEG$LLgeneAbbrev <- factor(bubbleDEG$LLgeneAbbrev, levels = c207specificgenes)
#order comparisons
bubbleDEG$comparison <- factor(bubbleDEG$comparison, levels = c("d23bp_homvwt_6dpf", "d23bp_homvwt", "dt238p_homvwt", "dc207y_homvwt"))
#rename comparisons
levels(bubbleDEG$comparison) <- c("23d46i 6dpf", "23d46i 2dpf", "t238p 2dpf", "c207y 2dpf")
# create 'bubble plot' to summarize y signatures across x phenotypes
bubbleplot_c207y <- ggplot(bubbleDEG, aes(x=LLgeneAbbrev, y=comparison)) +
geom_point(aes(size=-log10(padj), color = log2FoldChange)) +
scale_color_gradientn(colors = myheatcolors3, limit=limit) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
axis.title.x = element_blank(),
axis.title.y = element_blank())+
annotate("rect", xmin = 0, xmax = 7.5, ymin = 0.25, ymax = 0.5,
alpha=1, fill="#472D7BFF") +
annotate("rect", xmin = 7.5, xmax = 20, ymin = 0.25, ymax = 0.5,
alpha=1, fill="#21908CFF")
bubbleplot_c207y#add a column that is 1 if pvalue is <0.05 and +FC, -1 if padj is <0.05 and -FC
deaf1DEGdata2 <- deaf1DEGdata %>% mutate(Venn = ifelse(pvalue < 0.01 & log2FoldChange > 0, 1, ifelse(pvalue < 0.01 & log2FoldChange < 0, 1, 0)))
#filter to only include homvwt comparisons
deaf1DEGdata2 <- deaf1DEGdata2 %>% dplyr::filter(comparison %in% c("d23bp_homvwt", "d23bp_homvwt_6dpf", "dc207y_homvwt", "dt238p_homvwt"))
#make a new table with only gene name, comparison name, and Venn column
venntable <- deaf1DEGdata2 %>% dplyr::select(LLgeneAbbrev, comparison, Venn)
#find duplicates in venntable
duplicategenes <- dplyr::filter(venntable %>%
distinct() %>%
group_by(LLgeneAbbrev, comparison) %>%
dplyr::count(), n !=1)$LLgeneAbbrev
#remove duplicate rows from venntable
venntable <- venntable %>% dplyr::filter(!LLgeneAbbrev %in% duplicategenes)
#use pivot wider to make untidy table
venntable <- as.matrix(venntable %>% distinct() %>%
pivot_wider(names_from = comparison, values_from = Venn, values_fill=0) %>%
column_to_rownames('LLgeneAbbrev'))
#convert NULL to 0
venntable[venntable=='NULL'] <- 0
#find sum of values of rows
rowsum <- rowSums(venntable)
venntable <- cbind(venntable, rowsum)
#convert to dataframe
venntable <- as.data.frame(venntable)
#sort by those with highest sum of absolute values of rows
venntable <- venntable %>% arrange(desc(abs(rowsum)))
#filter by those with >1 significant gene
venntable <- venntable %>% dplyr::filter(rowsum >= 2)
#filter by those where 6dpf isn't different
venntable <- venntable %>% dplyr::filter(d23bp_homvwt_6dpf == 0)
#filter by those not on chr25
venntable_nochr25 <- setdiff(rownames(venntable), chr25genes)
#get all data for 6dpf genes
age23bp <- deaf1DEGdata %>% dplyr::filter(comparison == "d23bp_homvwt" & LLgeneAbbrev %in% venntable_nochr25)
#sort by pvalue
age23bp <- age23bp %>% arrange(pvalue)
age23bp$LLgeneAbbrev## [1] "CABZ01039863.1" "cav1" "mkrn1" "evlb"
## [5] "hdac10" "CABZ01039859.1" "apc2" "adarb2"
## [9] "slc25a22a" "vldlr" "col12a1a" "rapgef1a"
## [13] "dkk3b" "chordc1b" "draxin" "igf2bp2a"
## [17] "dusp6" "zgc:158689" "snx27a" "NC_002333.17"
## [21] "schip1" "samd11" "si:ch211-102c2.7" "tgfbr1b"
## [25] "rab3ab" "prkx" "pank1b" "pdgfra"
## [29] "elovl7b" "aff4" "casp8ap2" "zgc:162255"
## [33] "si:ch211-276c2.4" "pbx4" "si:ch211-239f4.1" "elovl4a"
## [37] "CACFD1" "SIPA1" "ganab" "rac3a"
## [41] "CC2D1A" "elovl2" "LOC108182329" "hipk2"
## [45] "ano9b" "LOC110438587" "pde5ab" "abi3b"
## [49] "CABZ01092170.1" "gas7a" "ctcf" "tnfaip8l2b"
## [53] "fbln2" "kdm6bb" "rab11fip4a" "zgc:110425"
## [57] "map6b" "CABZ01004876.1" "tfe3a" "msrb1a"
## [61] "mapk3" "ube2al" "rsf1a" "vac14"
## [65] "polr3b" "ldb2a" "LOC100537384" "rnasekb"
## [69] "hs3st4" "cenpf" "gdpd4a" "add1"
## [73] "sacm1la" "exoc5" "gjd1a" "rnf126"
## [77] "kif5c" "kitlga" "CABZ01029822.1" "myof"
## [81] "pax10" "prdm1b" "pdia3" "itpr1a"
## [85] "si:dkeyp-120h9.1" "impa2" "deaf1"
#comparisons to include in plot
comparisons <- c("dt238p_homvwt", "dc207y_homvwt", "d23bp_homvwt_6dpf", "d23bp_homvwt")
#make list of age-specific genes to include
agespecificgenes <- c(sort(c("CABZ01039859.1", "CABZ01092170.1", "evlb", "hdac10", "pdia3", "polr3b")), sort(c("CABZ01039863.1", "chordc1b", "dkk3b", "draxin", "dusp6", "elovl2", "exoc5", "itpr1a", "mkrn1", "msrb1a", 'pbx4', "prkx", "rapgef1a", "ube2al")))
#filter to only include genes of interest
bubbleDEG <- deaf1DEGdata %>% dplyr::filter(LLgeneAbbrev %in% agespecificgenes & comparison %in% comparisons)
#find middle of foldchange (to center color scale)
limit <- max(abs(bubbleDEG$log2FoldChange)) * c(-1, 1)
#order genes
bubbleDEG$LLgeneAbbrev <- factor(bubbleDEG$LLgeneAbbrev, levels = agespecificgenes)
#order comparisons
bubbleDEG$comparison <- factor(bubbleDEG$comparison, levels = c("d23bp_homvwt_6dpf", "d23bp_homvwt", "dt238p_homvwt", "dc207y_homvwt"))
#rename comparisons
levels(bubbleDEG$comparison) <- c("23d46i 6dpf", "23d46i 2dpf", "t238p 2dpf", "c207y 2dpf")
# create 'bubble plot' to summarize y signatures across x phenotypes
bubbleplot_ages <- ggplot(bubbleDEG, aes(x=LLgeneAbbrev, y=comparison)) +
geom_point(aes(size=-log10(padj), color = log2FoldChange)) +
scale_color_gradientn(colors = myheatcolors3, limit=limit) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
axis.title.x = element_blank(),
axis.title.y = element_blank())+
annotate("rect", xmin = 0, xmax = 6.5, ymin = 0.25, ymax = 0.5,
alpha=1, fill="#472D7BFF") +
annotate("rect", xmin = 6.5, xmax = 21, ymin = 0.25, ymax = 0.5,
alpha=1, fill="#21908CFF")
bubbleplot_ages#we want the zebrafish signatures
dr_gsea <- msigdbr(species = "Danio rerio") #gets all collections/signatures with zebrafish
#look at the categories and subcategories of signatures available
dr_gsea %>%
dplyr::distinct(gs_cat, gs_subcat) %>%
dplyr::arrange(gs_cat, gs_subcat)# choose a specific msigdb collection/subcollection
dr_gsea_c5 <- msigdbr(species = "Danio rerio", category = "C5") %>% # choose msigdb collection of interest
dplyr::select(gs_name, gene_symbol) #just get the columns corresponding to signature name and gene symbols of genes in each signature
# pull out data for dc207y
GSEAgenes <- deaf1DEGdata %>% dplyr::filter(comparison == "dc207y_homvwt")
GSEAgenes <- GSEAgenes %>% dplyr::filter(! LLgeneAbbrev %in% chr25genes)
mydata.df.sub <- dplyr::select(GSEAgenes, LLgeneAbbrev, log2FoldChange)
#get rid of duplicates
mydata.df.sub <- mydata.df.sub %>% dplyr::filter(LLgeneAbbrev %in% mydata.df.sub$LLgeneAbbrev[!duplicated(mydata.df.sub$LLgeneAbbrev)])
# construct a named vector
mydata.gsea <- mydata.df.sub$log2FoldChange
names(mydata.gsea) <- as.character(mydata.df.sub$LLgeneAbbrev)
mydata.gsea <- sort(mydata.gsea, decreasing = TRUE)
# run GSEA with C5 in c207y
myGSEA.res.c207y <- GSEA(mydata.gsea, TERM2GENE=dr_gsea_c5, verbose=FALSE, seed=TRUE, pvalueCutoff = 1)
# pull out data for 23bp
GSEAgenes <- deaf1DEGdata %>% dplyr::filter(comparison == "d23bp_homvwt")
GSEAgenes <- GSEAgenes %>% dplyr::filter(! LLgeneAbbrev %in% chr25genes)
mydata.df.sub <- dplyr::select(GSEAgenes, LLgeneAbbrev, log2FoldChange)
#get rid of duplicates
mydata.df.sub <- mydata.df.sub %>% dplyr::filter(LLgeneAbbrev %in% mydata.df.sub$LLgeneAbbrev[!duplicated(mydata.df.sub$LLgeneAbbrev)])
# construct a named vector
mydata.gsea <- mydata.df.sub$log2FoldChange
names(mydata.gsea) <- as.character(mydata.df.sub$LLgeneAbbrev)
mydata.gsea <- sort(mydata.gsea, decreasing = TRUE)
# run GSEA with C5 in 23bp
myGSEA.res.23bp <- GSEA(mydata.gsea, TERM2GENE=dr_gsea_c5, verbose=FALSE, seed=TRUE, pvalueCutoff = 1)
# pull out data for t238p
GSEAgenes <- deaf1DEGdata %>% dplyr::filter(comparison == "dt238p_homvwt")
GSEAgenes <- GSEAgenes %>% dplyr::filter(! LLgeneAbbrev %in% chr25genes)
mydata.df.sub <- dplyr::select(GSEAgenes, LLgeneAbbrev, log2FoldChange)
#get rid of duplicates
mydata.df.sub <- mydata.df.sub %>% dplyr::filter(LLgeneAbbrev %in% mydata.df.sub$LLgeneAbbrev[!duplicated(mydata.df.sub$LLgeneAbbrev)])
# construct a named vector
mydata.gsea <- mydata.df.sub$log2FoldChange
names(mydata.gsea) <- as.character(mydata.df.sub$LLgeneAbbrev)
mydata.gsea <- sort(mydata.gsea, decreasing = TRUE)
# run GSEA with C5 in t238p
myGSEA.res.t238p <- GSEA(mydata.gsea, TERM2GENE=dr_gsea_c5, verbose=FALSE, seed=TRUE)
#convert to data frame add column about mutations
myGSEA.df.t238p <- as_tibble(myGSEA.res.t238p@result)
myGSEA.df.c207y <- as_tibble(myGSEA.res.c207y@result)
myGSEA.df.23bp <- as_tibble(myGSEA.res.23bp@result)
myGSEA.df.t238p$mutation <- "t238p"
myGSEA.df.c207y$mutation <- "c207y"
myGSEA.df.23bp$mutation <- "23d46i"
#combine the GSEA data into one data frame
myGSEA2dpf <- rbind(myGSEA.df.t238p, myGSEA.df.c207y, myGSEA.df.23bp)
#export myGSEA2dpf
#write.csv(myGSEA2dpf, file="deaf1_2dpf_C5_GSEA.csv")
#import GSEA
myGSEA2dpf <- read.csv(file="deaf1_2dpf_C5_GSEA.csv")[,2:13]
#look at top terms
datatable(myGSEA2dpf,
extensions = c('KeyTable', "FixedHeader"),
options = list(keys = TRUE, searchHighlight = TRUE, pageLength = 10, lengthMenu = c("10", "25", "50", "100"))) %>%
formatRound(columns=c(2:10), digits=2)#list of eye-related terms
eyeterms <- c("GOBP_DETECTION_OF_LIGHT_STIMULUS", "HP_COLOR_VISION_DEFECT", "GOCC_PHOTORECEPTOR_INNER_SEGMENT", "HP_UNDETECTABLE_ELECTRORETINOGRAM", "GOBP_PHOTORECEPTOR_CELL_DIFFERENTIATION", "GOBP_RETINA_DEVELOPMENT_IN_CAMERA_TYPE_EYE", "GOBP_SENSORY_PERCEPTION_OF_LIGHT_STIMULUS", "GOBP_PHOTORECEPTOR_CELL_DEVELOPMENT", "HP_ABNORMAL_VISUAL_ELECTROPHYSIOLOGY", "GOBP_RETINA_MORPHOGENESIS_IN_CAMERA_TYPE_EYE", "GOBP_DETECTION_OF_LIGHT_STIMULUS_INVOLVED_IN_SENSORY_PERCEPTION", "HP_COLOR_VISION_DEFECT")
#filter to include only eye terms
myGSEA2dpf <- myGSEA2dpf %>% dplyr::filter(ID %in% eyeterms)
# create bubble plot of shared pathways
eyetermsbubble <- ggplot(myGSEA2dpf, aes(x=mutation, y=ID)) +
geom_point(aes(size=setSize, color = NES, alpha=-log10(p.adjust))) +
scale_color_gradientn(colors = myheatcolors3, limit=c(-3,3)) +
theme_bw() +
theme(axis.title.x=element_blank(), axis.title.y=element_blank())
eyetermsbubble#export 900x500
#make plot of c207y visual
gseaplot2(myGSEA.res.c207y,
geneSetID = c(91, 1640, 673), #can choose multiple signatures to overlay in this plot
pvalue_table = FALSE)#export 800x500
#make plot of t238p visual
gseaplot2(myGSEA.res.t238p,
geneSetID = c(19, 20, 15), #can choose multiple signatures to overlay in this plot
pvalue_table = FALSE)# pull out data for 23bp 6dpf
GSEAgenes <- deaf1DEGdata %>% dplyr::filter(comparison == "d23bp_homvwt_6dpf")
GSEAgenes <- GSEAgenes %>% dplyr::filter(! LLgeneAbbrev %in% chr25genes)
mydata.df.sub <- dplyr::select(GSEAgenes, LLgeneAbbrev, log2FoldChange)
#get rid of duplicates
mydata.df.sub <- mydata.df.sub %>% dplyr::filter(LLgeneAbbrev %in% mydata.df.sub$LLgeneAbbrev[!duplicated(mydata.df.sub$LLgeneAbbrev)])
# construct a named vector
mydata.gsea <- mydata.df.sub$log2FoldChange
names(mydata.gsea) <- as.character(mydata.df.sub$LLgeneAbbrev)
mydata.gsea <- sort(mydata.gsea, decreasing = TRUE)
# run GSEA with C5 in 23bp 6dpf
myGSEA.res.23bp6dpf <- GSEA(mydata.gsea, TERM2GENE=dr_gsea_c5, verbose=FALSE, seed=TRUE, pvalueCutoff = 1)
#convert to DF
myGSEA.df.23bp.6dpf <- as_tibble(myGSEA.res.23bp6dpf@result)
#export
#write.csv(myGSEA.res.23bp6dpf, file="GSEA6dpf.csv")
#import
myGSEA.df.23bp.6dpf <- read.csv("GSEA6dpf.csv")[,2:12]
#look at top terms
datatable(myGSEA.df.23bp.6dpf,
extensions = c('KeyTable', "FixedHeader"),
options = list(keys = TRUE, searchHighlight = TRUE, pageLength = 10, lengthMenu = c("10", "25", "50", "100"))) %>%
formatRound(columns=c(2:10), digits=2)#make list of terms to include in heat map
C5GSEAterms <- c("GOCC_NEURON_TO_NEURON_SYNAPSE", "GOCC_GLUTAMATERGIC_SYNAPSE", "GOCC_POSTSYNAPTIC_MEMBRANE", "GOCC_SYNAPTIC_MEMBRANE", "GOBP_POSITIVE_REGULATION_OF_INTERLEUKIN_1_PRODUCTION", "GOBP_RESPONSE_TO_CHEMOKINE", "GOBP_POSITIVE_REGULATION_OF_CYTOKINE_PRODUCTION")
#make heatmap
p3 <- heatplot(myGSEA.res.23bp6dpf, foldChange=mydata.gsea) + ggplot2::coord_flip()
#get data of out heatmap
p3 <- ggplot_build(p3)
heatmapdata <- p3$plot$data
#filter to only include results for subtypes of interest
heatmapdata <- heatmapdata %>% dplyr::filter(categoryID %in% C5GSEAterms)
#filter to only include fold changes > 0.5
genestoinclude <- heatmapdata %>% dplyr::filter(foldChange > 0.75 | foldChange < -0.75)
heatmapdata <- heatmapdata %>% dplyr::filter(Gene %in% genestoinclude$Gene)
#use pivot wider to make untidy table
heatmapdata <- heatmapdata %>% pivot_wider(names_from = categoryID, values_from = foldChange, values_fill=NA)
#turn heatmap data back into tidy table
heatmapdata <- heatmapdata %>% pivot_longer(!Gene, names_to = "categoryID", values_to = "foldChange")
#change order of data to be plotted
heatmapdata$categoryID <- factor(heatmapdata$categoryID, levels = rev(c("GOCC_POSTSYNAPTIC_MEMBRANE", "GOCC_SYNAPTIC_MEMBRANE", "GOCC_NEURON_TO_NEURON_SYNAPSE", "GOCC_GLUTAMATERGIC_SYNAPSE", "GOBP_POSITIVE_REGULATION_OF_INTERLEUKIN_1_PRODUCTION", "GOBP_RESPONSE_TO_CHEMOKINE", "GOBP_POSITIVE_REGULATION_OF_CYTOKINE_PRODUCTION" )))
#rename dataframe columns
colnames(heatmapdata) <- c("Gene", "categoryID", "log2foldchange")
C5_6dpf_heatmap <- ggplot(heatmapdata, aes(x = Gene, y = categoryID, fill = log2foldchange)) +
geom_tile(color="black") +
theme_classic() +
scale_fill_continuous_divergingx(palette = 'RdBu', mid = 0, na.value="white") +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1), axis.title.x=element_blank(), axis.title.y=element_blank())
C5_6dpf_heatmapThis uses the zebrafish anatomy ontology and expression information to make lists of genes that are expressed within each anatomy term.
#make a vector of the relationship types included in the ZFA obo file
relationshipnames <- get_relation_names("zfaontology.OBO.txt")
#read in OBO file
zfaontology <- get_ontology("zfaontology.OBO.txt", propagate_relationships=relationshipnames, extract_tags = "everything")
#read in gaf file
zfagaf<-readr::read_tsv("zfagaf.gaf") #import gaf
zfagaf <- zfagaf[37:235090,c(2:6)] #remove the header and unnecessary columns
colnames(zfagaf) <- c("zfingeneid", "genename", "relationship", "zfa", "evidence")
#make empty list
genezfalist <- list()
#use a loop to find all the ZFA terms associated with each gene and put them in a list
for (x in 1:length(zfagaf$genename)) {
#get the first gene
rowdata <- zfagaf[x,]
gene <- rowdata$genename #name of gene
zfaterm <- rowdata$zfa #name of ZFA term associated with gene
ancestors <- get_ancestors(zfaontology, zfaterm) #all ancestors of term
ancestorsgene <- as.data.frame(matrix(data=NA, nrow=length(ancestors), ncol=0)) #empty data frame
ancestorsgene$genename <- gene #add gene name to data frame
ancestorsgene$ZFAterm <- ancestors #add parents of ZFA term to data frame
genezfalist[[x]] <- ancestorsgene #put data frame in a list
}
#combine all the data frames from list into one data frame
genezfaassociation <- do.call(rbind, genezfalist)
#remove duplicate rows
genezfaassociation <- genezfaassociation %>% distinct()
#export file
write.csv(genezfaassociation, file="genezfaassociation.csv")
#pivot wider to make ZFA terms a column and each gene a row
pivot <- genezfaassociation <- genezfaassociation %>%
pivot_wider(
names_from = genename,
values_from = genename
)
pivot2 <- pivot
#add name of zfa ID to data frame
zfaontologid <- as.data.frame(zfaontology$name) #get ZFA ontology/id pairs
zfaontologid$id <- rownames(zfaontologid) #make column called id containing ZFA id
pivot2$description <- NA #add a column for description containing NAs
#add descriptions for ZFA IDs
for (x in 1:length(pivot2$description)) {
firstterm <- as.character(pivot2[x,1]) #get ZFA term out of pivot
filteredzfa <- zfaontologid %>% dplyr::filter(id == firstterm)
pivot2$description[x] <- filteredzfa$`zfaontology$name`
}
#rearrange table
pivot2 <- cbind(pivot2$ZFAterm, pivot2$description, pivot2[,2:14213])
#shift to the left
pivot3 = as.data.frame(t(apply(pivot2,1, function(x) { return(c(x[!is.na(x)],x[is.na(x)]) )} )))
colnames(pivot3) = colnames(pivot2)
#get rid of ZFA terms that only have one or two genes (zfa terms must have three genes)
pivot4 <- pivot3
pivot4 <- pivot4 %>% drop_na(tbxta)
pivot4 <- pivot4 %>% drop_na(bglap)
pivot4[is.na(pivot4)] <- ""
colnames(pivot4)[1] <- "zfaterm"
#export gmt with at least 3 genes/zfa term
write.table(pivot4, file='zfa-geneexp-min.gmt', quote=FALSE, sep='\t', row.names =FALSE)
#pivot longer to make tidy for GSEA
pivot5 <- pivot4[,2:14214]
colnames(pivot5)[1] <- "zfaterm"
pivot5 <- pivot5 %>%
pivot_longer(
!zfaterm,
names_to = "extracolumn",
values_to = "genename"
)
pivot5 <- pivot5[,c(1,3)]
fullsetGSEA <- pivot5[!(is.na(pivot5$genename) | pivot5$genename==""), ]
#filter to only include descendants of head
headterms <- get_descendants(zfaontology, "ZFA:0001114") #get descendants of head
pivot5 <- pivot4 %>% dplyr::filter (zfaterm %in% headterms)
pivot5 <- pivot5[,2:14214]
colnames(pivot5)[1] <- "zfaterm"
pivot5 <- pivot5 %>%
pivot_longer(
!zfaterm,
names_to = "extracolumn",
values_to = "genename"
)
pivot5 <- pivot5[,c(1,3)]
headtermsGSEA <- pivot5[!(is.na(pivot5$genename) | pivot5$genename==""), ]
#get descendants of CNS
CNSterms <- get_descendants(zfaontology, "ZFA:0000012")
pivot5 <- pivot4 %>% dplyr::filter (zfaterm %in% CNSterms)
pivot5 <- pivot5[,2:14214]
colnames(pivot5)[1] <- "zfaterm"
pivot5 <- pivot5 %>%
pivot_longer(
!zfaterm,
names_to = "extracolumn",
values_to = "genename"
)
pivot5 <- pivot5[,c(1,3)]
CNStermsGSEA <- pivot5[!(is.na(pivot5$genename) | pivot5$genename==""), ]
#change NA to empty string for full gmt
pivot3[is.na(pivot3)] <- ""
#export full gmt
write.table(pivot3, file='zfa-geneexp.gmt', quote=FALSE, sep='\t', row.names =FALSE)
#token = upload_GMT_file(gmtfile = "zfa-geneexp-min.gmt")
#the token to use the gmt is "gp__3Toj_Qd5k_dG4"# Pull out just the columns corresponding to gene symbols and LogFC for at least one pairwise comparison for the enrichment analysis
GSEAgenes <- deaf1DEGdata %>% dplyr::filter(comparison == "d23bp_homvwt_6dpf")
mydata.df.sub <- dplyr::select(GSEAgenes, LLgeneAbbrev, log2FoldChange)
#get rid of duplicates
mydata.df.sub <- mydata.df.sub %>% dplyr::filter(LLgeneAbbrev %in% mydata.df.sub$LLgeneAbbrev[!duplicated(mydata.df.sub$LLgeneAbbrev)])
# construct a named vector
mydata.gsea <- mydata.df.sub$log2FoldChange
names(mydata.gsea) <- as.character(mydata.df.sub$LLgeneAbbrev)
mydata.gsea <- sort(mydata.gsea, decreasing = TRUE)
#options for doing GSEA using ZFA
#CNStermsGSEA
#headtermsGSEA
#fullsetGSEA
# run GSEA with CNS terms in 23bp
myGSEA.res <- GSEA(mydata.gsea, TERM2GENE=CNStermsGSEA, verbose=FALSE, seed=TRUE, minGSSize = 80)
myGSEA.df <- as_tibble(myGSEA.res@result)
#export file
#write.csv(myGSEA.df, file="functionalanalysis/6dpf_CNSterms_GSEA.csv")
#import file
myGSEA.df <- read.csv(file="functionalanalysis/6dpf_CNSterms_GSEA.csv")[,2:12]
# view results as an interactive table
datatable(myGSEA.df,
extensions = c('KeyTable', "FixedHeader"),
options = list(keys = TRUE, searchHighlight = TRUE, pageLength = 10, lengthMenu = c("10", "25", "50", "100"))) %>%
formatRound(columns=c(2:10), digits=2)#make network plot
myGSEA.res <- pairwise_termsim(myGSEA.res)
deaf1network <- emapplot(myGSEA.res, color="NES", categorySize="p.adjust")
#get data of out network plot
deaf1network <- ggplot_build(deaf1network)
networkdata <- deaf1network$plot$data
deaf1CNSterms_network <- ggraph(networkdata) +
geom_edge_link(alpha=.8, aes_(width=~I(width)), colour='darkgrey') +
geom_node_point(aes(colour = color, size=size)) +
geom_node_text(aes(label=name), repel=TRUE) +
theme_void() +
scale_color_gradientn(colors = myheatcolors3, limit=c(-2.5,2.5), name="NES")
deaf1CNSterms_network#export 700x500
#make heatmap
p3 <- heatplot(myGSEA.res, foldChange=mydata.gsea) + ggplot2::coord_flip()
#get data of out heatmap
p3 <- ggplot_build(p3)
heatmapdata <- p3$plot$data
#filter to only include fold changes > 0.5
genestoinclude <- heatmapdata %>% dplyr::filter(foldChange > 0.5 | foldChange < -0.5)
heatmapdata <- heatmapdata %>% dplyr::filter(Gene %in% genestoinclude$Gene)
#use pivot wider to make untidy table
heatmapdata <- heatmapdata %>% pivot_wider(names_from = categoryID, values_from = foldChange, values_fill=NA)
#make matrix
heatmapmatrix <- as.matrix(heatmapdata[,2:12])
rownames(heatmapmatrix) <- heatmapdata$Gene
#turn matrix into 1s and 0s and cluster
heatmapmatrix<-ifelse(abs(heatmapmatrix) > 0,1,0)
heatmapmatrix[is.na(heatmapmatrix)] = 0
#cluster rows by pearson correlation
hc <- hclust(as.dist(1-cor(heatmapmatrix, method="spearman")), method="average") #cluster columns by spearman correlation
#turn heatmap data back into tidy table
heatmapdata <- heatmapdata %>% pivot_longer(!Gene, names_to = "categoryID", values_to = "foldChange")
#change order of data to be plotted
heatmapdata$categoryID <- factor(heatmapdata$categoryID, levels = hc$labels)
#rename dataframe columns
colnames(heatmapdata) <- c("Gene", "categoryID", "log2foldchange")
zfaCNS_heatmap <- ggplot(heatmapdata, aes(x = Gene, y = categoryID, fill = log2foldchange)) +
geom_tile(color="black") +
theme_classic() +
scale_fill_continuous_divergingx(palette = 'RdBu', mid = 0, na.value="white") +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1), axis.title.x=element_blank(), axis.title.y=element_blank())
zfaCNS_heatmap#export 1000x400
# run GSEA with head terms in 23bp
myGSEA.res <- GSEA(mydata.gsea, TERM2GENE=headtermsGSEA, verbose=FALSE, seed=TRUE, minGSSize = 80)
myGSEA.df <- as_tibble(myGSEA.res@result)
# view results as an interactive table
datatable(myGSEA.df,
extensions = c('KeyTable', "FixedHeader"),
options = list(keys = TRUE, searchHighlight = TRUE, pageLength = 10, lengthMenu = c("10", "25", "50", "100"))) %>%
formatRound(columns=c(2:10), digits=2)#make network plot
myGSEA.res <- pairwise_termsim(myGSEA.res)
deaf1network <- emapplot(myGSEA.res, color="NES", categorySize="p.adjust")
#get data of out heatmap
deaf1network <- ggplot_build(deaf1network)
networkdata <- deaf1network$plot$data
deaf1headterms_network <- ggraph(networkdata) +
geom_edge_link(alpha=.8, aes_(width=~I(width)), colour='darkgrey') +
geom_node_point(aes(colour = color, size=size)) +
geom_node_text(aes(label=name), repel=TRUE) +
theme_void() +
scale_color_gradientn(colors = myheatcolors3, limit=c(-2.5,2.5), name = "NES")
deaf1headterms_network#import single cell markers
singlecellmarkers <- read.csv("zf5dpf_markersforGSEA.csv")
#select relevant columns
singlecellmarkers <- singlecellmarkers %>% dplyr::select(cluster.description, gene)
# Pull out just the columns corresponding to gene symbols and LogFC
GSEAgenes <- deaf1DEGdata %>% dplyr::filter(comparison == "d23bp_homvwt_6dpf")
mydata.df.sub <- dplyr::select(GSEAgenes, LLgeneAbbrev, log2FoldChange)
#get rid of duplicates
mydata.df.sub <- mydata.df.sub %>% dplyr::filter(LLgeneAbbrev %in% mydata.df.sub$LLgeneAbbrev[!duplicated(mydata.df.sub$LLgeneAbbrev)])
# construct a named vector
mydata.gsea <- mydata.df.sub$log2FoldChange
names(mydata.gsea) <- as.character(mydata.df.sub$LLgeneAbbrev)
mydata.gsea <- sort(mydata.gsea, decreasing = TRUE)
# run GSEA with single cell markers in 6dpf 23bp
myGSEA.res <- GSEA(mydata.gsea, TERM2GENE=singlecellmarkers, verbose=FALSE, seed=TRUE, minGSSize = 80)
myGSEA.df <- as_tibble(myGSEA.res@result)
#export
#write.csv(myGSEA.df, file="deaf1-singlecellGSEA.csv")
#import
myGSEA.df <- read.csv(file="deaf1-singlecellGSEA.csv")[,2:12]
datatable(myGSEA.df,
extensions = c('KeyTable', "FixedHeader"),
options = list(keys = TRUE, searchHighlight = TRUE, pageLength = 10, lengthMenu = c("10", "25", "50", "100"))) %>%
formatRound(columns=c(2:10), digits=2)#make network plot
myGSEA.res <- pairwise_termsim(myGSEA.res)
deaf1network <- emapplot(myGSEA.res, color="NES", categorySize="p.adjust", showCategory = 57)
#get data of out network plot
deaf1network <- ggplot_build(deaf1network)
networkdata <- deaf1network$plot$data
#make network plot
deaf1singlecell_network <- ggraph(networkdata) +
geom_edge_link(alpha=.8, aes_(width=~I(width)), colour='darkgrey') +
geom_node_point(aes(colour = color, size=size)) +
geom_node_text(aes(label=name), repel=TRUE) +
theme_void() +
scale_color_gradientn(colors = myheatcolors3, limit=c(-3.5,3.5), name="NES")
deaf1singlecell_network#export 700x500
#repeat 6dpf GSEA but with neuronal clusters only
nonneuronalclusters <- c("#N/A", "cartilage #2", "cartilage #1", "cartilage #3", "cornea", "cranial ganglion", "cranial ganglion (vagal) #1", "cranial ganglion (vagal) #2", "dermal bone", "epidermis (progenitors)", "epithelium (cornea, progenitors)", "erythrocytes", "eye (cornea), cartilage", "eye (cornea, epithelium)", "eye (non retina)", "glia (eye)", "lens #1", "lens #2", "muscle", "neural crest derived (pigment/xanthophore)", "neutrophils", "otic vesicle", "otic/lateral line", "pigment cell (iridophore)", "retina (amacrine, gabaergic)", "retina (cone bipolar cells)", "retina (cones)", "retina (muller glia)", "retina (photoreceptor precursor cells)", "retina (RGC)", "retina (rods)", "retina (RPE differentiation)", "retina (RPE)", "retina neurons (horizontal?)", "rostral blood island (myeloid) #1", "rostral blood island (myeloid) #2", "pharangeal arch/pectoral fin (mesoderm)")
#make GSEA set for CNS single cell data
singlecellmarkers_CNS <- singlecellmarkers %>% dplyr::filter(! cluster.description %in% nonneuronalclusters)
# run GSEA with single cell markers in 6dpf 23bp
myGSEA.res <- GSEA(mydata.gsea, TERM2GENE=singlecellmarkers_CNS, verbose=FALSE, seed=TRUE, minGSSize = 80)
myGSEA.df <- as_tibble(myGSEA.res@result)
#export
#write.csv(myGSEA.df, file="deaf1-singlecellGSEA-CNS.csv")
#import
myGSEA.df <- read.csv(file="deaf1-singlecellGSEA-CNS.csv")[,2:12]
#view data table
datatable(myGSEA.df,
extensions = c('KeyTable', "FixedHeader"),
options = list(keys = TRUE, searchHighlight = TRUE, pageLength = 10, lengthMenu = c("10", "25", "50", "100"))) %>%
formatRound(columns=c(2:10), digits=2)#make network plot
myGSEA.res <- pairwise_termsim(myGSEA.res)
deaf1network <- emapplot(myGSEA.res, color="NES", categorySize="p.adjust", showCategory = 33)
#get data of out network plot
deaf1network <- ggplot_build(deaf1network)
networkdata <- deaf1network$plot$data
#make new network plot
deaf1singlecellCNS_network <- ggraph(networkdata) +
geom_edge_link(alpha=.8, aes_(width=~I(width)), colour='darkgrey') +
geom_node_point(aes(colour = color, size=size)) +
geom_node_text(aes(label=name), repel=TRUE) +
theme_void() +
scale_color_gradientn(colors = myheatcolors3, limit=c(-3,3), name="NES")
deaf1singlecellCNS_network#export 700x500
#list of neuronal subtypes to graph
neuronalsubtypes <- c("dorsal diencephalon (thalamus) #1", "dorsal diencephalon (thalamus) #2", "diencephalon (tuberculum; gabaergic)", "optic tectum (gabaergic) #1", "mid-hind boundary (gabaergic) #1", "dorsal diencephalon (thalamus) #3", "mid-hind boundary (gabaergic) #2", "dorsal diencephalon (thalamus) #4", "diencephalon", "neurons (dienc, glutamatergic, ZLI)", "optic tectum (gabaergic) #2", "midbrain #2", "hypothalamus #1", "hypothalamus #5","neurons (midbrain?, glutamatergic)", "hypothalamus #6")
#make heatmap
p3 <- heatplot(myGSEA.res, foldChange=mydata.gsea) + ggplot2::coord_flip()
#get data of out heatmap
p3 <- ggplot_build(p3)
heatmapdata <- p3$plot$data
#filter to only include results for subtypes of interest
heatmapdata <- heatmapdata %>% dplyr::filter(categoryID %in% neuronalsubtypes)
#filter to only include fold changes > 0.5
genestoinclude <- heatmapdata %>% dplyr::filter(foldChange > 0.5 | foldChange < -0.5)
heatmapdata <- heatmapdata %>% dplyr::filter(Gene %in% genestoinclude$Gene)
#use pivot wider to make untidy table
heatmapdata <- heatmapdata %>% pivot_wider(names_from = categoryID, values_from = foldChange, values_fill=NA)
#make matrix
heatmapmatrix <- as.matrix(heatmapdata[,2:15])
rownames(heatmapmatrix) <- heatmapdata$Gene
#turn matrix into 1s and 0s and cluster
heatmapmatrix<-ifelse(abs(heatmapmatrix) > 0,1,0)
heatmapmatrix[is.na(heatmapmatrix)] = 0
#cluster your selected genes
hr <- hclust(as.dist(1-cor(t(heatmapmatrix), method="pearson")), method="complete") #cluster rows by pearson correlation
hc <- hclust(as.dist(1-cor(heatmapmatrix, method="spearman")), method="average") #cluster columns by spearman correlation
#turn heatmap data back into tidy table
heatmapdata <- heatmapdata %>% pivot_longer(!Gene, names_to = "categoryID", values_to = "foldChange")
#change order of data to be plotted
heatmapdata$categoryID <- factor(heatmapdata$categoryID, levels = hc$labels)
heatmapdata$Gene <- factor(heatmapdata$Gene, levels = hr$labels)
#rename dataframe columns
colnames(heatmapdata) <- c("Gene", "categoryID", "log2foldchange")
singlecellheatmap <- ggplot(heatmapdata, aes(x = Gene, y = categoryID, fill = log2foldchange)) +
geom_tile(color="black") +
theme_classic() +
scale_fill_continuous_divergingx(palette = 'RdBu', mid = 0, na.value="white") +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1), axis.title.x=element_blank(), axis.title.y=element_blank())
singlecellheatmapDEAF1 Chip-seq data was downloaded from this URL: https://www.encodeproject.org/files/ENCFF813TRY/ on 10/30/2023. Accession is ENCFF813TRY. Peaks are annotated to human gene names using ChIPSeeker. The goal is to identify
#import data
samplefiles <- list.files("deaf1bindingsites", pattern= ".bed", full.names=T)
samplefiles <- as.list(samplefiles)
names(samplefiles) <- c("DEAF1")
#import DEAF1 binding data
peak <- readPeakFile(samplefiles[[1]])
#import human gene annotation data
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
#retrieve annotations for peak calls: tss is 2000 downstream and upstream, flank distance for nearby genes is 250bp
peakAnnoList <- lapply(samplefiles, annotatePeak, TxDb=txdb,
tssRegion=c(-2000, 2000), verbose=FALSE, addFlankGeneInfo=TRUE, flankDistance = 250)
#make heat map of binding over promoters
promoter <- getPromoters(TxDb=txdb, upstream=2000, downstream=2000)
tagMatrix <- getTagMatrix(peak, windows=promoter)## >> preparing start_site regions by gene... 2023-11-15 5:20:38 PM
## >> preparing tag matrix... 2023-11-15 5:20:38 PM
#make graph of binding versus distance from TSS
plotAvgProf(tagMatrix, xlim=c(-2000, 2000),
xlab="Genomic Region (5'->3')", ylab = "Read Count Frequency")## >> plotting figure... 2023-11-15 5:20:44 PM
#export 700x500
#This graph makes it seem like most DEAF1 binding at the promoter is between -500 and ~+500
#make bar plot of where DEAF1 is binding
plotAnnoBar(peakAnnoList)#export 700x500
#DEAF1 is primarily binding to promoters, within 1kb of TSS
#Distribution of TF-binding loci relative to TSS
plotDistToTSS(peakAnnoList, title="Distribution of transcription factor-binding loci \n relative to TSS")# save annotations for each peak call
deaf1_annot <- data.frame(peakAnnoList[["DEAF1"]]@anno)
#get entrez gene ids
entrez <- deaf1_annot$geneId
# Return the gene symbol for the set of Entrez IDs
annotations_edb <- AnnotationDbi::select(EnsDb.Hsapiens.v75,
keys = entrez,
columns = c("GENENAME"),
keytype = "ENTREZID")
# Change IDs to character type to merge
annotations_edb$ENTREZID <- as.character(annotations_edb$ENTREZID)
#add geneID annotations
deaf1_annot <- deaf1_annot %>%
left_join(annotations_edb, by=c("geneId"="ENTREZID"))
#export file with peak annotations
write.csv(deaf1_annot, file="deaf1_chipseq_geneannotation.csv")
#filter to only include DEAF1 binding at TSS (promoters)
deaf1_promoterpeaks <- deaf1_annot %>% dplyr::filter(annotation %in% c("Promoter (<=1kb)", "Promoter (1-2kb)"))
#keep only columns with gene names
deaf1_promoterpeaks <- deaf1_promoterpeaks %>% dplyr::select(geneId, flank_geneIds)
#split flank_txIds by semicolon
deaf1_promoterpeaks <- deaf1_promoterpeaks %>% separate_wider_delim(flank_geneIds, ";", names = c(paste("x", 1:15)), too_many = "drop", too_few = "align_start")
#turn df into vector
deaf1_promoterpeaks <- unlist(deaf1_promoterpeaks, use.names=FALSE)
#remove duplicate values
deaf1_promoterpeaks <- unique(deaf1_promoterpeaks)
#remove NA
deaf1_promoterpeaks <- na.omit(deaf1_promoterpeaks)
# Return the gene symbol for the set of Entrez IDs
annotations_promoter <- AnnotationDbi::select(EnsDb.Hsapiens.v75,
keys = deaf1_promoterpeaks,
columns = c("GENENAME"),
keytype = "ENTREZID")
#convert zebrafish genes to mouse genes(output is a dictionary)
mouse_zfish_dict <- orthogene::convert_orthologs(gene_df=annotations_promoter,
gene_input="GENENAME",
gene_output="dict",
input_species="human",
output_species="zebrafish",
non121_strategy="kbs",
method="gprofiler")
#change ENSDARG00000089860 to atxn7l1
mouse_zfish_dict["ATXN7L1"] <- "atxn7l1"
#get list of zebrafish genes with human CHIPSeq binding
deaf1_chipseq_genes <- unique(unname(mouse_zfish_dict))
#add EIF4G3a and EIF4G3b (EIF4G3 incorrectly converted from human)
deaf1_chipseq_genes <- c(deaf1_chipseq_genes, "eif4g3a", "eif4g3b")
#make GSEA set for deaf1 chipseq
deaf1chipseqGSEA <- as.data.frame(deaf1_chipseq_genes)
deaf1chipseqGSEA$term <- "DEAF1chipseq"
deaf1chipseqGSEA <- deaf1chipseqGSEA %>% dplyr::select(term, deaf1_chipseq_genes)List of zebrafish genes with DEAF1 motifs within 100 upstream of the TSS was acquired by using https://rsat.france-bioinformatique.fr/metazoa/genome-scale-dna-pattern_form.cgi. Organism was Danio rerio GRCz10. Feature type was gene. Sequence type was upstream between -100 and -1. Sequence label was gene name. Overlap with upstream ORFs was allowed. Search strand was both strands. Match counts was set to 1. Prevent overlapping matches was unchecked. Substitutions was 0. Query patterns were: TCGNNNNNTCG TCGNNNNNNTCG TCGNNNNNNNTCG TCGNNNNNNNNTCG TCGNNNNNNNNNTCG TCGNNNNNNNNNNTCG TCGNNNNNNNNNNNTCG
The server command was: $RSAT/perl-scripts/retrieve-seq -all -nocomment -org Danio_rerio_GRCz10 -feattype gene -type upstream -format fasta -label name -from -100 -to -1 | $RSAT/perl-scripts/dna-pattern -nolimits -v -format fasta -pl $RSAT/public_html/tmp/www-data/2023/11/15/gs-dna-pattern_2023-11-15.055318_5Nh2ac_patterns.txt -return counts -th 1 -subst 0 | $RSAT/perl-scripts/add-gene-info -info descr -org Danio_rerio_GRCz10 | $RSAT/perl-scripts/add-linenb dna-pattern -nolimits -v -format fasta -pl $RSAT/public_html/tmp/www-data/2023/11/15/gs-dna-pattern_2023-11-15.055318_5Nh2ac_patterns.txt -return counts -th 1 -subst 0 Citation: van Helden et al. (2000). Yeast 16(2), 177-187. Input format fasta Pattern file $RSAT/public_html/tmp/www-data/2023/11/15/gs-dna-pattern_2023-11-15.055318_5Nh2ac_patterns.txt Search method regexp Threshold 1 Allowed substitutions 0 Return fields counts Patterns seq id score TCGNNNNNTCG TCGNNNNNTCG 1 TCGNNNNNNTCG TCGNNNNNNTCG 1 TCGNNNNNNNTCG TCGNNNNNNNTCG 1 TCGNNNNNNNNTCG TCGNNNNNNNNTCG 1 TCGNNNNNNNNNTCG TCGNNNNNNNNNTCG 1 TCGNNNNNNNNNNTCG TCGNNNNNNNNNNTCG 1 TCGNNNNNNNNNNNTCG TCGNNNNNNNNNNNTCG 1
The resulting file is gs-dna-pattern_2023-11-15.055318_5Nh2ac.dnapat
#import genes with binding sites within 100 of TSS
rsatbinding <- read.table(file="deaf1bindingsites/gs-dna-pattern_2023-11-15.055318_5Nh2ac.dnapat.txt", fill=TRUE)
#make a list of the unique binding sites
rsatbinding <- unique(rsatbinding$V1)
#add deaf1 (has DEAF1 binding motif but TSS annotation is incorrect in assembly)
rsatbinding <- c(rsatbinding, "deaf1")
#make GSEA set for deaf1 binding sites
deaf1bindingGSEA <- as.data.frame(rsatbinding)
deaf1bindingGSEA$term <- "DEAF1motif"
deaf1bindingGSEA <- deaf1bindingGSEA %>% dplyr::select(term, rsatbinding)
#make a list of genes shared between zebrafish binding site motif (rsatbinding) and human DEAF1 chipseq
sharedgenes <- intersect(rsatbinding, deaf1_chipseq_genes)
#make a list of genes in rsatbinding but not in deaf1 chipseq
rsatbinding <- rsatbinding[!rsatbinding %in% sharedgenes]
#make a list of genes in deaf1 chipseq but not in rsatbinding
deaf1_chipseq_genes <- deaf1_chipseq_genes[!deaf1_chipseq_genes %in% sharedgenes]
#add new column to deaf1DEGdata with information about binding site
deaf1DEGdata <- deaf1DEGdata %>% mutate(bindingsite = ifelse(LLgeneAbbrev %in% rsatbinding, "motif near TSS", ifelse(LLgeneAbbrev %in% deaf1_chipseq_genes, "ChIP-Seq peak", ifelse(LLgeneAbbrev %in% sharedgenes, "both", "neither"))))
#filter comparisons to only include 23bp hom v wt comparisons
myTopHits.df <- deaf1DEGdata %>% dplyr::filter(comparison %in% c("d23bp_homvwt_6dpf"))
#add column about whether chr is chr25
myTopHits.df <- myTopHits.df %>% dplyr::mutate(chrom = replace_na(chrom, "none"))
myTopHits.df <- myTopHits.df %>% mutate(chromosome = ifelse(chrom == "chr25", "chromosome 25", "other chromosome"))
#change order that points are plotted
myTopHits.df <- myTopHits.df %>% arrange(match(bindingsite, c("neither", "ChIP-Seq peak", "motif near TSS", "both")), desc(bindingsite))
genestolabel <- c("sec22a", "creb1b", "nfyal", "atxn7l1", "galk2", "nfyal", "cdkn1bb", "ambra1b", "khk", "muc5.1", "znf319b", "ccl39.3", "LOC100330348", "map1lc3b", "ccl39.6", "ccl39.7", "usb1", "cdc27", "deaf1")
#subset to only include labeled genes
myTopHits.labels <- myTopHits.df %>% dplyr::filter(LLgeneAbbrev %in% genestolabel)
#make the plot
deaf1_6dpf_volcano <- ggplot(myTopHits.df) +
aes(y=-log10(padjbound), x=log2FoldChange, text = paste("Symbol:", LLgeneAbbrev), color=bindingsite, shape=chromosome) +
annotate("rect", xmin = 1, xmax = 7, ymin = -log10(0.05), ymax = 10,
alpha=.15, fill="grey") +
annotate("rect", xmin = -1, xmax = -7, ymin = -log10(0.05), ymax = 10,
alpha=.15, fill="grey") +
geom_point(size=2) +
theme_bw() +
coord_cartesian(xlim = c(-8, 8), ylim = c(-0.5, 10.5), expand = FALSE) +
ylab("-log10(padj)") +
xlab("log2 fold change") +
geom_label_repel(data=myTopHits.labels, aes(x=log2FoldChange, y=-log10(padjbound), label=LLgeneAbbrev), force = 2, nudge_y = -1, size = 2.5, max.overlaps = Inf, show.legend = FALSE, color = "black") + #label selected genes
theme_bw() +
scale_color_manual(values = c("#AADC32FF", "#27AD81FF", "#472D7BFF", "darkgrey"), name = "DEAF1 motif/ \n binding near TSS") +
scale_shape_manual(values=c(4, 16))
deaf1_6dpf_volcano#filter comparisons to only include 23bp hom v wt comparisons
myTopHits.df <- deaf1DEGdata %>% dplyr::filter(comparison %in% c("d23bp_homvwt"))
#add column about whether chr is chr25
myTopHits.df <- myTopHits.df %>% dplyr::mutate(chrom = replace_na(chrom, "none"))
myTopHits.df <- myTopHits.df %>% mutate(chromosome = ifelse(chrom == "chr25", "chromosome 25", "other chromosome"))
#change order that points are plotted
myTopHits.df <- myTopHits.df %>% arrange(match(bindingsite, c("neither", "ChIP-Seq peak", "motif near TSS", "both")), desc(bindingsite))
genestolabel <- c("khk", "muc5.1", "sec22a", "ambra1b", "znf319b", "ptpn9a", "saal1", "ptdss2", "muc5.2", "tmem138", "CABZ01039863.1", "LOC110438378", "cav1", "btbd10a", "snupn", "evlb", "mkrn1", "klhdc4", "erp44", "nfyal", "chordc1b")
#subset to only include labeled genes
myTopHits.labels <- myTopHits.df %>% dplyr::filter(LLgeneAbbrev %in% genestolabel)
#make the plot
deaf1_2dpf_23bp_volcano <- ggplot(myTopHits.df) +
aes(y=-log10(padjbound), x=log2FoldChange, text = paste("Symbol:", LLgeneAbbrev), color=bindingsite, shape=chromosome) +
annotate("rect", xmin = 1, xmax = 5, ymin = -log10(0.05), ymax = 10,
alpha=.15, fill="grey") +
annotate("rect", xmin = -1, xmax = -5, ymin = -log10(0.05), ymax = 10,
alpha=.15, fill="grey") +
geom_point(size=2) +
theme_bw() +
coord_cartesian(xlim = c(-6, 6), ylim = c(-0.5, 10.5), expand = FALSE) +
ylab("-log10(padj)") +
xlab("log2 fold change") +
geom_label_repel(data=myTopHits.labels, aes(x=log2FoldChange, y=-log10(padjbound), label=LLgeneAbbrev), force = 2, nudge_y = -1, size = 2.5, max.overlaps = Inf, show.legend = FALSE, color = "black") + #label selected genes
theme_bw() +
scale_color_manual(values = c("#AADC32FF", "#27AD81FF", "#472D7BFF", "darkgrey"), name = "DEAF1 motif/ \n binding near TSS") +
scale_shape_manual(values=c(4, 16))
deaf1_2dpf_23bp_volcano#filter comparisons to only include c207y hom v wt comparison
myTopHits.df <- deaf1DEGdata %>% dplyr::filter(comparison %in% c("dc207y_homvwt"))
#add column about whether chr is chr25
myTopHits.df <- myTopHits.df %>% dplyr::mutate(chrom = replace_na(chrom, "none"))
myTopHits.df <- myTopHits.df %>% mutate(chromosome = ifelse(chrom == "chr25", "chromosome 25", "other chromosome"))
#change order that points are plotted
myTopHits.df <- myTopHits.df %>% arrange(match(bindingsite, c("neither", "ChIP-Seq peak", "motif near TSS", "both")), desc(bindingsite))
genestolabel <- c("khk", "sec22a", "ptdss2", "nfyal", "snupn", "btbd10a", "cav1", "mkrn1", "CABZ01039863.1", "hdac10", "CABZ01039859.1", "atxn7l1", "ccnd2a", "zgc:77158", "dus2", "slc25a22a", "dkk3b", "cpt1ab", "deaf1", "dusp6", "setd6", "dusp8a")
#subset to only include labeled genes
myTopHits.labels <- myTopHits.df %>% dplyr::filter(LLgeneAbbrev %in% genestolabel)
#make the plot
deaf1_2dpf_c207y_volcano <- ggplot(myTopHits.df) +
aes(y=-log10(padjbound), x=log2FoldChange, text = paste("Symbol:", LLgeneAbbrev), color=bindingsite, shape=chromosome) +
annotate("rect", xmin = 1, xmax = 5, ymin = -log10(0.05), ymax = 10,
alpha=.15, fill="grey") +
annotate("rect", xmin = -1, xmax = -5, ymin = -log10(0.05), ymax = 10,
alpha=.15, fill="grey") +
geom_point(size=2) +
theme_bw() +
coord_cartesian(xlim = c(-6, 6), ylim = c(-0.5, 10.5), expand = FALSE) +
ylab("-log10(padj)") +
xlab("log2 fold change") +
geom_label_repel(data=myTopHits.labels, aes(x=log2FoldChange, y=-log10(padjbound), label=LLgeneAbbrev), force = 2, nudge_y = -1, size = 2.5, max.overlaps = Inf, show.legend = FALSE, color = "black") + #label selected genes
theme_bw() +
scale_color_manual(values = c("#AADC32FF", "#27AD81FF", "#472D7BFF", "darkgrey"), name = "DEAF1 motif/ \n binding near TSS") +
scale_shape_manual(values=c(4, 16))
deaf1_2dpf_c207y_volcano#filter comparisons to only include t238p hom v wt comparison
myTopHits.df <- deaf1DEGdata %>% dplyr::filter(comparison %in% c("dt238p_homvwt"))
#add column about whether chr is chr25
myTopHits.df <- myTopHits.df %>% dplyr::mutate(chrom = replace_na(chrom, "none"))
myTopHits.df <- myTopHits.df %>% mutate(chromosome = ifelse(chrom == "chr25", "chromosome 25", "other chromosome"))
#change order that points are plotted
myTopHits.df <- myTopHits.df %>% arrange(match(bindingsite, c("neither", "ChIP-Seq peak", "motif near TSS", "both")), desc(bindingsite))
genestolabel <- c("khk", "sec22a", "hdac10", "slc25a22a", "deaf1", "atxn7l", "brd9", "myo6a", "chchd3b", "unc45a", "tgfbr1b", "nuak1b", "mta1", "calm1a", "id3", "zgc:163040", "cdc27")
#subset to only include labeled genes
myTopHits.labels <- myTopHits.df %>% dplyr::filter(LLgeneAbbrev %in% genestolabel)
#make the plot
deaf1_2dpf_t238p_volcano <- ggplot(myTopHits.df) +
aes(y=-log10(padjbound), x=log2FoldChange, text = paste("Symbol:", LLgeneAbbrev), color=bindingsite, shape=chromosome) +
annotate("rect", xmin = 1, xmax = 5, ymin = -log10(0.05), ymax = 10,
alpha=.15, fill="grey") +
annotate("rect", xmin = -1, xmax = -5, ymin = -log10(0.05), ymax = 10,
alpha=.15, fill="grey") +
geom_point(size=2) +
theme_bw() +
coord_cartesian(xlim = c(-6, 6), ylim = c(-0.5, 10.5), expand = FALSE) +
ylab("-log10(padj)") +
xlab("log2 fold change") +
geom_label_repel(data=myTopHits.labels, aes(x=log2FoldChange, y=-log10(padjbound), label=LLgeneAbbrev), force = 2, nudge_y = -1, size = 2.5, max.overlaps = Inf, show.legend = FALSE, color = "black") + #label selected genes
theme_bw() +
scale_color_manual(values = c("#AADC32FF", "#27AD81FF", "#472D7BFF", "darkgrey"), name = "DEAF1 motif/ \n binding near TSS") +
scale_shape_manual(values=c(4, 16))
deaf1_2dpf_t238p_volcano#pick out a genes to plot
geneofinterest <- c("deaf1", "cdkn1bb", "eif4g3b", "eif4g3a", "rac3a", "rac3b", "khk", "sec22a", "creb1b", "nfyal", "atxn7l1", "cdc27", "hdac10", "cdc27")
#make new columns with mean counts of wt
genecountslogfc <- genecounts
genecountslogfc$c208yavg <- rowMeans(genecountslogfc[,12:16])
genecountslogfc$t238pavg <- rowMeans(genecountslogfc[,17:20])
genecountslogfc$d23bpavg <- rowMeans(genecountslogfc[,33:37])
genecountslogfc$d23bpavg6dpf <- rowMeans(genecountslogfc[,53:56])
#divide columns by mean counts of wt
genecountslogfc <- genecountslogfc %>% mutate(across(colnames(genecountslogfc)[4:16], function(x) x/c208yavg))
genecountslogfc <- genecountslogfc %>% mutate(across(colnames(genecountslogfc)[17:29], function(x) x/t238pavg))
genecountslogfc <- genecountslogfc %>% mutate(across(colnames(genecountslogfc)[30:42], function(x) x/d23bpavg))
genecountslogfc <- genecountslogfc %>% mutate(across(colnames(genecountslogfc)[43:56], function(x) x/d23bpavg6dpf))
#pull out data for a select gene
generepdata <- genecountslogfc %>% dplyr::filter(LLgeneAbbrev %in% geneofinterest)
#get rid of unnecessary columns
generepdata <- generepdata[,c(1,4:56)]
#pivot longer to make tidy
generepdata <- generepdata %>% pivot_longer(!LLgeneAbbrev, names_to = "sample", values_to = "foldchange")
#add condition information based on sampleName
generepdata$condition <- generepdata$sample
#substitute condition name based on replicate
generepdata <- generepdata %>% mutate(condition = str_replace_all(condition, c("^chet.*"="c207y_het_2dpf", "^chom.*"="c207y_hom_2dpf", "^cwt.*" = "c207y_wt_2dpf", "^twt.*" = "t238p_wt_2dpf", "^thet.*" = "t238p_het_2dpf", "^thom.*" = "t238p_hom_2dpf", "^dhet.*" = "23d46i_het_2dpf", "^dwt.*" = "23d46i_wt_2dpf", "^dhom.*" = "23d46i_hom_2dpf", "^het.*" = "23d46i_het_6dpf", "^hom.*" = "23d46i_hom_6dpf", "^wt.*" = "23d46i_wt_6dpf")))
#split condition into three columns
generepdata <- generepdata %>% separate(condition, sep = "_", remove = FALSE, c('mutation', 'genotype', 'age'))
#make genotype a factor
generepdata$genotype <- factor(generepdata$genotype, levels = c("wt", "het", "hom"))
#make plots manually split by age
colors <- c("#472D7BFF", "#FDE725FF", "#21908CFF") #set colors
#make an empty list
plot_list = list()
#make all the plots
for (z in 1:length(geneofinterest)) {
genedata2dpf <- generepdata %>% dplyr::filter(LLgeneAbbrev == geneofinterest[z] & age =="2dpf")
plot <- ggplot(genedata2dpf, aes(fill=genotype, y=foldchange, x=mutation)) +
geom_bar(position="dodge", stat="summary") +
geom_point(position = position_dodge(width = .9)) +
labs(title = paste(genedata2dpf$LLgeneAbbrev[z], ": 2dpf")) +
ylab("fold change") +
theme_bw() +
scale_fill_manual(values = colors)
genedata23bp <- generepdata %>% dplyr::filter(LLgeneAbbrev == geneofinterest[z] & mutation =="23d46i")
yrange <- layer_scales(plot)$y$range$range
plot1 <- ggplot(genedata23bp, aes(fill=genotype, y=foldchange, x=age)) +
geom_bar(position="dodge", stat="summary") +
geom_point(position = position_dodge(width = .9)) +
labs(title = paste(genedata2dpf$LLgeneAbbrev[z], ": 23d46i")) +
ylab("fold change") +
ylim(yrange[1], yrange[2]) +
theme_bw() +
scale_fill_manual(values = colors)
plot_list[[z]] <- plot_grid(plot + theme(legend.position="none"), plot1 )
}
# Save plots to svg Makes a separate file for each plot.
for (i in 1:length(geneofinterest)) {
file_name = paste("deaf1_foldchange_", geneofinterest[i], ".svg", sep="")
svglite(file_name)
print(plot_list[[i]])
dev.off()
} #subset deaf1DEGdata to only include comparison, logfc, gene name, and pvalue
deaf1DEGdata_scatter <- deaf1DEGdata %>% dplyr::select(log2FoldChange, pvaluebound, chrom, bindingsite, comparison, LLgeneAbbrev)
#add column about whether chr is chr25
deaf1DEGdata_scatter <- deaf1DEGdata_scatter %>% dplyr::mutate(chrom = replace_na(chrom, "none"))
deaf1DEGdata_scatter <- deaf1DEGdata_scatter %>% mutate(chromosome = ifelse(chrom == "chr25", "chromosome 25", "other chromosome"))
#find duplicates
duplicategenes <- dplyr::filter(deaf1DEGdata_scatter %>%
distinct() %>%
group_by(LLgeneAbbrev, comparison) %>%
dplyr::count(), n !=1)$LLgeneAbbrev
#remove duplicate rows from deaf1DEGdata_scatter
deaf1DEGdata_scatter <- deaf1DEGdata_scatter %>% dplyr::filter(!LLgeneAbbrev %in% duplicategenes)
#use pivot wider to make untidy table
deaf1DEGdata_scatter <- deaf1DEGdata_scatter %>% distinct() %>%
pivot_wider(names_from = comparison, values_from = c(log2FoldChange, pvaluebound), values_fill=NA)
#add rownames
row.names(deaf1DEGdata_scatter) <- deaf1DEGdata_scatter$LLgeneAbbrev
#genes to label
genestolabel <- c("LOC110438378", "ndufa9b", "muc5d", "sec22a", "ccl39.3", "muc5.1", "khk", "ambra1b", "tmem138", "col21a1", "CABZ01039863.1", "atxn7l1", "znf319b", "ccl39.6", "ccl39.7", "XLOC_042595", "nfyal", "cyp27b1")
#subset deaf1DEGdata_scatter to only include labeled genes
myTopHits.labels <- deaf1DEGdata_scatter %>% dplyr::filter(LLgeneAbbrev %in% genestolabel)
#change order that points are plotted
deaf1DEGdata_scatter <- deaf1DEGdata_scatter %>% arrange(match(bindingsite, c("neither", "ChIP-Seq peak", "motif near TSS", "both")), desc(bindingsite))
scatterplot_deaf1_ages <- ggplot() +
geom_point(data=deaf1DEGdata_scatter, aes(x=log2FoldChange_d23bp_homvwt, y=log2FoldChange_d23bp_homvwt_6dpf, text = paste("Symbol:", LLgeneAbbrev), color=bindingsite, shape=chromosome)) + #plot data points
geom_hline(yintercept=0, linetype = 'dotted') + #add horizontal line
geom_vline(xintercept=0, linetype = 'dotted') + #add vertical line
theme_bw() +
scale_color_manual(values = c("#AADC32FF", "#27AD81FF", "#472D7BFF", "darkgrey"), name = "DEAF1 motif/ \n binding near TSS") +
scale_shape_manual(values=c(4, 16)) +
geom_label_repel(data=myTopHits.labels, aes(x=log2FoldChange_d23bp_homvwt, y=log2FoldChange_d23bp_homvwt_6dpf, label=LLgeneAbbrev), force = 1, nudge_y = .5, size = 2.5, max.overlaps = Inf, show.legend = FALSE, color = "black") + #label selected genes
ylab("log2 fold change 23d46i 6dpf") +
xlab("log2 fold change 23d46i 2dpf")
scatterplot_deaf1_ages#genes to label
genestolabel <- c("LOC110438378", "CABZ01039863.1", "atxn7l1", "khk", "muc5.1", "sec22a", "nfyal", "unc45a", "apoa1b", "deaf1", "zgc:163040")
#subset deaf1DEGdata_scatter to only include labeled genes
myTopHits.labels <- deaf1DEGdata_scatter %>% dplyr::filter(LLgeneAbbrev %in% genestolabel)
scatterplot_deaf1_t238p <- ggplot() +
geom_point(data=deaf1DEGdata_scatter, aes(x=log2FoldChange_d23bp_homvwt, y=log2FoldChange_dt238p_homvwt, text = paste("Symbol:", LLgeneAbbrev), color=bindingsite, shape=chromosome)) + #plot data points
geom_hline(yintercept=0, linetype = 'dotted') + #add horizontal line
geom_vline(xintercept=0, linetype = 'dotted') + #add vertical line
theme_bw() +
scale_color_manual(values = c("#AADC32FF", "#27AD81FF", "#472D7BFF", "darkgrey"), name = "DEAF1 motif/ \n binding near TSS") +
scale_shape_manual(values=c(4, 16)) +
geom_label_repel(data=myTopHits.labels, aes(x=log2FoldChange_d23bp_homvwt, y=log2FoldChange_dt238p_homvwt, label=LLgeneAbbrev), force = 1, nudge_y = .5, size = 2.5, max.overlaps = Inf, show.legend = FALSE, color = "black") + #label selected genes
ylab("log2 fold change t238p 2dpf") +
xlab("log2 fold change 23d46i 2dpf")
scatterplot_deaf1_t238p#genes to label
genestolabel <- c("CABZ01039863.1", "atxn7l1", "khk", "muc5.1", "sec22a", "nfyal", "deaf1", "zgc:163040", "cav1", "CABZ01039859.1", "actn3a")
#subset deaf1DEGdata_scatter to only include labeled genes
myTopHits.labels <- deaf1DEGdata_scatter %>% dplyr::filter(LLgeneAbbrev %in% genestolabel)
scatterplot_deaf1_c207y <- ggplot() +
geom_point(data=deaf1DEGdata_scatter, aes(x=log2FoldChange_d23bp_homvwt, y=log2FoldChange_dc207y_homvwt, text = paste("Symbol:", LLgeneAbbrev), color=bindingsite, shape=chromosome)) + #plot data points
geom_hline(yintercept=0, linetype = 'dotted') + #add horizontal line
geom_vline(xintercept=0, linetype = 'dotted') + #add vertical line
theme_bw() +
scale_color_manual(values = c("#AADC32FF", "#27AD81FF", "#472D7BFF", "darkgrey"), name = "DEAF1 motif/ \n binding near TSS") +
scale_shape_manual(values=c(4, 16)) +
geom_label_repel(data=myTopHits.labels, aes(x=log2FoldChange_d23bp_homvwt, y=log2FoldChange_dc207y_homvwt, label=LLgeneAbbrev), force = 1, nudge_y = .5, size = 2.5, max.overlaps = Inf, show.legend = FALSE, color = "black") + #label selected genes
ylab("log2 fold change c207y 2dpf") +
xlab("log2 fold change 23d46i 2dpf")
scatterplot_deaf1_c207yGet promoter sequences for dysregulated genes.
The list of shared upregulated and downregulated genes (venntableup and venntable down) was submitted to http://rsat.sb-roscoff.fr/retrieve-ensembl-seq_form.cgi to retrieve promoter sequences. Query organism was zebrafish. Ensembl database version was 110. Single organism was selected. Sequence type was upstream/downstream. Mask repeats and mask coding sequences were deselected. Avoid redundant sequences due to alternative transcripts was selected. Sequence position was upstream -300 to -1. Relative to feature was gene. Prevent overlap with neighboring was none. Sequences for upregulated genes are in the file: retrieve-ensembl-seq_2023-11-15.224451_lUoDXG and sequences for downregulated genes are in the file: retrieve-ensembl-seq_2023-11-15.224849_6kZwHv Sequences for both are in the file: retrieve-ensembl-seq_2023-11-15.225255_nHSvhx
Packages and versions necessary to reproduce the results in this report.
## R version 4.3.1 (2023-06-16 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] svglite_2.1.2
## [2] ggraph_2.1.0
## [3] colorspace_2.1-0
## [4] ggpubr_0.6.0
## [5] lattice_0.22-5
## [6] org.Dr.eg.db_3.18.0
## [7] ensembldb_2.26.0
## [8] AnnotationFilter_1.26.0
## [9] GenomicFeatures_1.54.1
## [10] ChIPseeker_1.38.0
## [11] orthogene_1.8.0
## [12] plotly_4.10.3
## [13] BaseSet_0.9.0
## [14] ontologyIndex_2.11
## [15] enrichplot_1.22.0
## [16] msigdbr_7.5.1
## [17] clusterProfiler_4.10.0
## [18] gprofiler2_0.2.2
## [19] GSVA_1.50.0
## [20] GSEABase_1.64.0
## [21] graph_1.80.0
## [22] annotate_1.80.0
## [23] XML_3.99-0.14
## [24] AnnotationDbi_1.64.0
## [25] DT_0.30
## [26] RColorBrewer_1.1-3
## [27] ggrepel_0.9.4
## [28] scales_1.2.1
## [29] viridis_0.6.4
## [30] viridisLite_0.4.2
## [31] cowplot_1.1.1
## [32] DESeq2_1.42.0
## [33] SummarizedExperiment_1.32.0
## [34] Biobase_2.62.0
## [35] MatrixGenerics_1.14.0
## [36] matrixStats_1.1.0
## [37] GenomicRanges_1.54.1
## [38] GenomeInfoDb_1.38.0
## [39] IRanges_2.36.0
## [40] S4Vectors_0.40.1
## [41] BiocGenerics_0.48.0
## [42] lubridate_1.9.3
## [43] forcats_1.0.0
## [44] stringr_1.5.0
## [45] dplyr_1.1.3
## [46] purrr_1.0.2
## [47] readr_2.1.4
## [48] tidyr_1.3.0
## [49] tibble_3.2.1
## [50] ggplot2_3.4.4
## [51] tidyverse_2.0.0
## [52] knitr_1.45
## [53] tinytex_0.48
## [54] rmarkdown_2.25
## [55] TxDb.Hsapiens.UCSC.hg38.knownGene_3.18.0
## [56] EnsDb.Hsapiens.v75_2.99.0
##
## loaded via a namespace (and not attached):
## [1] ProtGenerics_1.34.0
## [2] fs_1.6.3
## [3] bitops_1.0-7
## [4] HDO.db_0.99.1
## [5] httr_1.4.7
## [6] tools_4.3.1
## [7] backports_1.4.1
## [8] utf8_1.2.4
## [9] R6_2.5.1
## [10] HDF5Array_1.30.0
## [11] lazyeval_0.2.2
## [12] rhdf5filters_1.14.0
## [13] withr_2.5.2
## [14] prettyunits_1.2.0
## [15] gridExtra_2.3
## [16] cli_3.6.1
## [17] scatterpie_0.2.1
## [18] labeling_0.4.3
## [19] sass_0.4.7
## [20] systemfonts_1.0.5
## [21] Rsamtools_2.18.0
## [22] yulab.utils_0.1.0
## [23] gson_0.1.0
## [24] DOSE_3.28.0
## [25] plotrix_3.8-2
## [26] rstudioapi_0.15.0
## [27] RSQLite_2.3.2
## [28] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
## [29] generics_0.1.3
## [30] gridGraphics_0.5-1
## [31] BiocIO_1.12.0
## [32] crosstalk_1.2.0
## [33] vroom_1.6.4
## [34] gtools_3.9.4
## [35] car_3.1-2
## [36] homologene_1.4.68.19.3.27
## [37] GO.db_3.18.0
## [38] Matrix_1.6-1.1
## [39] fansi_1.0.5
## [40] abind_1.4-5
## [41] lifecycle_1.0.4
## [42] yaml_2.3.7
## [43] carData_3.0-5
## [44] gplots_3.1.3
## [45] rhdf5_2.46.0
## [46] qvalue_2.34.0
## [47] SparseArray_1.2.0
## [48] BiocFileCache_2.10.1
## [49] grid_4.3.1
## [50] blob_1.2.4
## [51] promises_1.2.1
## [52] crayon_1.5.2
## [53] beachmat_2.18.0
## [54] KEGGREST_1.42.0
## [55] pillar_1.9.0
## [56] fgsea_1.28.0
## [57] boot_1.3-28.1
## [58] rjson_0.2.21
## [59] codetools_0.2-19
## [60] fastmatch_1.1-4
## [61] glue_1.6.2
## [62] ggfun_0.1.3
## [63] data.table_1.14.8
## [64] vctrs_0.6.4
## [65] png_0.1-8
## [66] treeio_1.26.0
## [67] gtable_0.3.4
## [68] cachem_1.0.8
## [69] xfun_0.41
## [70] S4Arrays_1.2.0
## [71] mime_0.12
## [72] tidygraph_1.2.3
## [73] SingleCellExperiment_1.24.0
## [74] interactiveDisplayBase_1.40.0
## [75] ellipsis_0.3.2
## [76] nlme_3.1-163
## [77] ggtree_3.10.0
## [78] bit64_4.0.5
## [79] progress_1.2.2
## [80] filelock_1.0.2
## [81] bslib_0.5.1
## [82] irlba_2.3.5.1
## [83] KernSmooth_2.23-22
## [84] DBI_1.1.3
## [85] tidyselect_1.2.0
## [86] bit_4.0.5
## [87] compiler_4.3.1
## [88] curl_5.1.0
## [89] xml2_1.3.5
## [90] DelayedArray_0.28.0
## [91] shadowtext_0.1.2
## [92] rtracklayer_1.62.0
## [93] caTools_1.18.2
## [94] rappdirs_0.3.3
## [95] digest_0.6.33
## [96] XVector_0.42.0
## [97] htmltools_0.5.6.1
## [98] pkgconfig_2.0.3
## [99] sparseMatrixStats_1.14.0
## [100] highr_0.10
## [101] dbplyr_2.4.0
## [102] fastmap_1.1.1
## [103] rlang_1.1.1
## [104] htmlwidgets_1.6.2
## [105] shiny_1.7.5.1
## [106] DelayedMatrixStats_1.24.0
## [107] farver_2.1.1
## [108] jquerylib_0.1.4
## [109] jsonlite_1.8.7
## [110] BiocParallel_1.36.0
## [111] GOSemSim_2.28.0
## [112] BiocSingular_1.18.0
## [113] RCurl_1.98-1.12
## [114] magrittr_2.0.3
## [115] GenomeInfoDbData_1.2.11
## [116] ggplotify_0.1.2
## [117] patchwork_1.1.3
## [118] Rhdf5lib_1.24.0
## [119] munsell_0.5.0
## [120] Rcpp_1.0.11
## [121] ggnewscale_0.4.9
## [122] ape_5.7-1
## [123] babelgene_22.9
## [124] stringi_1.8.1
## [125] zlibbioc_1.48.0
## [126] MASS_7.3-60
## [127] AnnotationHub_3.10.0
## [128] plyr_1.8.9
## [129] parallel_4.3.1
## [130] HPO.db_0.99.2
## [131] Biostrings_2.70.1
## [132] graphlayouts_1.0.1
## [133] splines_4.3.1
## [134] hms_1.1.3
## [135] locfit_1.5-9.8
## [136] igraph_1.5.1
## [137] ggsignif_0.6.4
## [138] reshape2_1.4.4
## [139] biomaRt_2.58.0
## [140] ScaledMatrix_1.10.0
## [141] BiocVersion_3.18.0
## [142] evaluate_0.23
## [143] BiocManager_1.30.22
## [144] tzdb_0.4.0
## [145] tweenr_2.0.2
## [146] httpuv_1.6.12
## [147] grr_0.9.5
## [148] polyclip_1.10-6
## [149] ggforce_0.4.1
## [150] rsvd_1.0.5
## [151] broom_1.0.5
## [152] xtable_1.8-4
## [153] restfulr_0.0.15
## [154] tidytree_0.4.5
## [155] MPO.db_0.99.7
## [156] rstatix_0.7.2
## [157] later_1.3.1
## [158] snow_0.4-4
## [159] aplot_0.2.2
## [160] GenomicAlignments_1.38.0
## [161] memoise_2.0.1
## [162] timechange_0.2.0